Skip to main content

aft/
semantic_index.rs

1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
4use crate::search_index::{cache_relative_path, cached_path_under_root};
5use crate::symbols::{Symbol, SymbolKind};
6use crate::{slog_info, slog_warn};
7
8use fastembed::{EmbeddingModel as FastembedEmbeddingModel, InitOptions, TextEmbedding};
9use rayon::prelude::*;
10use reqwest::blocking::Client;
11use serde::{Deserialize, Serialize};
12use std::collections::{HashMap, HashSet, VecDeque};
13use std::env;
14use std::fmt::Display;
15use std::fs;
16use std::path::{Path, PathBuf};
17use std::time::Duration;
18use std::time::SystemTime;
19use tree_sitter::Parser;
20use url::Url;
21
// NOTE(review): the next six constants look like limits and layout sizes for
// the persisted index format; confirm against the index reader/writer.
const DEFAULT_DIMENSION: usize = 384;
const MAX_ENTRIES: usize = 1_000_000;
const MAX_DIMENSION: usize = 1024;
const F32_BYTES: usize = std::mem::size_of::<f32>();
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS) or apt install libonnxruntime (Linux).";

const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
/// V3 adds subsec_nanos to the file-mtime table so staleness detection survives
/// restart round-trips on filesystems with subsecond mtime precision (APFS,
/// ext4 with nsec, NTFS). V1/V2 persisted whole-second mtimes only, which
/// caused every restart to flag ~99% of files as stale and re-embed them.
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
/// V4 keeps the V3 on-disk layout but rebuilds persisted snippets once after
/// fixing symbol ranges that were incorrectly treated as 1-based.
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
/// V5 adds file sizes to the file metadata table so incremental staleness
/// detection can catch content changes even when mtime precision misses them.
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
/// V6 stores paths relative to project_root and adds content hashes.
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended (after a `/v1` segment) to an OpenAI-compatible base URL.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended to an Ollama base URL that does not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
// Must stay below the bridge timeout (30s) to avoid bridge kills on slow backends.
// Despite the name, this default is applied to the shared HTTP client, which
// is used by the Ollama backend as well.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the config leaves `max_batch_size` at 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Maximum number of query embeddings kept in the per-model query cache.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint `base_url` when the config has no
/// base URL or the URL cannot be normalized.
const FALLBACK_BACKEND: &str = "none";
/// Total number of attempts (first try included) for one embedding request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay before each retry, indexed by the zero-based index of the attempt
/// that just failed.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
55
/// Identity of the embedding setup that produced a semantic index.
///
/// The fingerprint is serialized to JSON (see `as_string`) and compared as a
/// string, so field order and field values both matter.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier, taken from `SemanticBackend::as_str`.
    pub backend: String,
    /// Model name as given in the backend config.
    pub model: String,
    /// Normalized base URL, or the `"none"` placeholder when the config has no
    /// usable URL. Defaults to an empty string when absent from stored data.
    #[serde(default)]
    pub base_url: String,
    /// Length of the vectors produced by the backend.
    pub dimension: usize,
    /// Version of the chunking scheme; defaults to the current version when
    /// absent from stored data.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
66
/// Current chunking version. Used both as the serde default for
/// `SemanticIndexFingerprint::chunking_version` and when building a new
/// fingerprint from config.
fn default_chunking_version() -> u32 {
    2
}
70
71impl SemanticIndexFingerprint {
72    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
73        // Use normalized URL for fingerprinting so cosmetic differences
74        // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds.
75        let base_url = config
76            .base_url
77            .as_ref()
78            .and_then(|u| normalize_base_url(u).ok())
79            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
80        Self {
81            backend: config.backend.as_str().to_string(),
82            model: config.model.clone(),
83            base_url,
84            dimension,
85            chunking_version: default_chunking_version(),
86        }
87    }
88
89    pub fn as_string(&self) -> String {
90        serde_json::to_string(self).unwrap_or_else(|_| String::new())
91    }
92
93    fn matches_expected(&self, expected: &str) -> bool {
94        let encoded = self.as_string();
95        !encoded.is_empty() && encoded == expected
96    }
97}
98
/// Backend-specific state needed to produce embeddings.
enum SemanticEmbeddingEngine {
    /// Embeddings computed in-process through fastembed.
    Fastembed(TextEmbedding),
    /// Remote endpoint speaking the OpenAI embeddings protocol.
    OpenAiCompatible {
        /// Blocking HTTP client used for every request.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Normalized base URL the embeddings path is appended to.
        base_url: String,
        /// Bearer token sent in the `Authorization` header when present.
        api_key: Option<String>,
    },
    /// Remote Ollama server.
    Ollama {
        /// Blocking HTTP client used for every request.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Normalized base URL the embed path is appended to.
        base_url: String,
    },
}
113
/// An embedding backend plus its settings and a small cache of query
/// embeddings.
pub struct SemanticEmbeddingModel {
    /// Backend selected in the config.
    backend: SemanticBackend,
    /// Model name as given in the config.
    model: String,
    /// Base URL exactly as given in the config (not normalized).
    base_url: Option<String>,
    /// Effective HTTP timeout in milliseconds (config value or default).
    timeout_ms: u64,
    /// Effective maximum batch size (config value or default).
    max_batch_size: usize,
    /// Vector length, cached once it has been observed.
    dimension: Option<usize>,
    /// Backend-specific state used to compute embeddings.
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding, bounded by `QUERY_EMBEDDING_CACHE_CAP`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of cached queries; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    /// Number of cached-query lookups served from the cache.
    query_embedding_cache_hits: u64,
    /// Number of cached-query lookups that had to call the backend.
    query_embedding_cache_misses: u64,
}
127
/// Alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
129
/// Check that a backend returned one vector per input and that all vectors
/// share the same length.
///
/// `context` is prefixed to every error message so the caller can tell which
/// backend or call site produced the bad batch.
///
/// # Errors
///
/// Returns a message when the batch is empty although inputs were sent, when
/// the vector count differs from `expected_count`, or when any vector's
/// length differs from the first vector's.
fn validate_embedding_batch(
    vectors: &[Vec<f32>],
    expected_count: usize,
    context: &str,
) -> Result<(), String> {
    let actual_count = vectors.len();

    if actual_count == 0 && expected_count != 0 {
        return Err(format!(
            "{context} returned no vectors for {expected_count} inputs"
        ));
    }
    if actual_count != expected_count {
        return Err(format!(
            "{context} returned {} vectors for {} inputs",
            actual_count, expected_count
        ));
    }

    // An empty batch for zero inputs is valid and has no dimension to check.
    let Some(reference) = vectors.first() else {
        return Ok(());
    };
    let expected_dimension = reference.len();

    match vectors
        .iter()
        .position(|vector| vector.len() != expected_dimension)
    {
        Some(index) => Err(format!(
            "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
            vectors[index].len()
        )),
        None => Ok(()),
    }
}
164
165/// Normalize a base URL: validate scheme and strip trailing slash.
166/// Does NOT perform SSRF/private-IP validation — call
167/// `validate_base_url_no_ssrf` separately when processing user-supplied config.
168fn normalize_base_url(raw: &str) -> Result<String, String> {
169    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
170    let scheme = parsed.scheme();
171    if scheme != "http" && scheme != "https" {
172        return Err(format!(
173            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
174            scheme
175        ));
176    }
177    Ok(parsed.to_string().trim_end_matches('/').to_string())
178}
179
180/// Validate that a base URL does not point to a private/loopback address.
181/// Call this on user-supplied config (at configure time) to prevent SSRF.
182/// Not called for programmatically constructed configs (e.g. tests).
183///
184/// **Loopback is allowed.** Self-hosted embedding backends (e.g. Ollama at
185/// `http://127.0.0.1:11434`) are a primary use case for `aft_search`. Loopback
186/// addresses by definition cannot be exploited as SSRF targets — they only
187/// reach services on the same machine. Allowing loopback unblocks Ollama at its
188/// default config without opening up SSRF to LAN/intranet services, which
189/// remain rejected.
190///
191/// **mDNS `.local` is rejected.** mDNS hostnames typically resolve to LAN
192/// devices (printers, homelab servers); rejecting them before DNS lookup keeps
193/// the SSRF guard meaningful for non-loopback private networks.
194pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
195    use std::net::{IpAddr, ToSocketAddrs};
196
197    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
198
199    let host = parsed.host_str().unwrap_or("");
200
201    // Loopback hostnames are explicitly allowed. RFC 6761 mandates that
202    // `localhost` and `*.localhost` resolve to loopback;
203    // `localhost.localdomain` is a historical alias used on some Linux
204    // distros. Self-hosted backends like Ollama use these by default.
205    let is_loopback_host =
206        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
207    if is_loopback_host {
208        return Ok(());
209    }
210
211    // mDNS hostnames are typically LAN devices, not loopback. Reject before
212    // DNS lookup so users get a clear error rather than a private-IP error.
213    if host.ends_with(".local") {
214        return Err(format!(
215            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
216        ));
217    }
218
219    // Resolve the hostname. Reject private/link-local/CGNAT IPs but NOT
220    // loopback (which is by definition same-machine and not an SSRF target).
221    let port = parsed.port_or_known_default().unwrap_or(443);
222    let addr_str = format!("{host}:{port}");
223    let addrs: Vec<IpAddr> = addr_str
224        .to_socket_addrs()
225        .map(|iter| iter.map(|sa| sa.ip()).collect())
226        .unwrap_or_default();
227    for ip in &addrs {
228        if is_private_non_loopback_ip(ip) {
229            return Err(format!(
230                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
231            ));
232        }
233    }
234
235    Ok(())
236}
237
/// Returns true for IPv4/IPv6 addresses in private/link-local/CGNAT/wildcard
/// ranges, EXCLUDING loopback (127.0.0.0/8 and ::1). Loopback is considered
/// safe for SSRF purposes — see [`validate_base_url_no_ssrf`] for rationale.
///
/// Ranges treated as private:
///
/// * IPv4: `10.0.0.0/8`, `172.16.0.0/12`, `192.168.0.0/16`,
///   `169.254.0.0/16` (link-local), `100.64.0.0/10` (CGNAT) and
///   `0.0.0.0/8` (wildcard).
/// * IPv6: `::` (unspecified, the IPv6 counterpart of the IPv4 wildcard),
///   `fe80::/10` (link-local) and `fc00::/7` (unique-local).
/// * IPv4-mapped IPv6 (`::ffff:a.b.c.d`) is classified by its embedded IPv4
///   address, so `::ffff:127.0.0.1` is still treated as loopback.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => {
            let [first, second, ..] = v4.octets();
            // Note: 127.0.0.0/8 (loopback) is intentionally NOT in this set.
            // 10.0.0.0/8
            first == 10
            // 172.16.0.0/12
            || (first == 172 && (16..=31).contains(&second))
            // 192.168.0.0/16
            || (first == 192 && second == 168)
            // 169.254.0.0/16 link-local
            || (first == 169 && second == 254)
            // 100.64.0.0/10 CGNAT
            || (first == 100 && (64..=127).contains(&second))
            // 0.0.0.0/8 wildcard
            || first == 0
        }
        IpAddr::V6(v6) => {
            // ::ffff:0:0/96 IPv4-mapped — defer to the embedded IPv4 address.
            if let Some(mapped) = v6.to_ipv4_mapped() {
                return is_private_non_loopback_ip(&IpAddr::V4(mapped));
            }

            let leading = v6.segments()[0];
            // Note: ::1 (loopback) is intentionally NOT in this set.
            // :: unspecified (wildcard)
            v6.is_unspecified()
            // fe80::/10 link-local
            || (leading & 0xffc0) == 0xfe80
            // fc00::/7 unique-local
            || (leading & 0xfe00) == 0xfc00
        }
    }
}
279
280fn build_openai_embeddings_endpoint(base_url: &str) -> String {
281    if base_url.ends_with("/v1") {
282        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
283    } else {
284        format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
285    }
286}
287
288fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
289    if base_url.ends_with("/api") {
290        format!("{base_url}/embed")
291    } else {
292        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
293    }
294}
295
/// Trim surrounding whitespace from an optional value and collapse a blank
/// result to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    value
        .map(|token| token.trim().to_string())
        .filter(|token| !token.is_empty())
}
306
307fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
308    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
309}
310
/// True when a transport error is worth retrying. Only connection failures
/// qualify; every other error kind is reported to the caller immediately.
fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
    error.is_connect()
}
314
315fn sleep_before_embedding_retry(attempt_index: usize) {
316    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
317        std::thread::sleep(Duration::from_millis(*delay_ms));
318    }
319}
320
321fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
322where
323    F: FnMut() -> reqwest::blocking::RequestBuilder,
324{
325    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
326        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
327
328        let response = match make_request().send() {
329            Ok(response) => response,
330            Err(error) => {
331                if !last_attempt && is_retryable_embedding_error(&error) {
332                    sleep_before_embedding_retry(attempt_index);
333                    continue;
334                }
335                return Err(format!("{backend_label} request failed: {error}"));
336            }
337        };
338
339        let status = response.status();
340        let raw = match response.text() {
341            Ok(raw) => raw,
342            Err(error) => {
343                if !last_attempt && is_retryable_embedding_error(&error) {
344                    sleep_before_embedding_retry(attempt_index);
345                    continue;
346                }
347                return Err(format!("{backend_label} response read failed: {error}"));
348            }
349        };
350
351        if status.is_success() {
352            return Ok(raw);
353        }
354
355        if !last_attempt && is_retryable_embedding_status(status) {
356            sleep_before_embedding_retry(attempt_index);
357            continue;
358        }
359
360        return Err(format!(
361            "{backend_label} request failed (HTTP {}): {}",
362            status, raw
363        ));
364    }
365
366    unreachable!("embedding request retries exhausted without returning")
367}
368
369impl SemanticEmbeddingModel {
370    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
371        let timeout_ms = if config.timeout_ms == 0 {
372            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
373        } else {
374            config.timeout_ms
375        };
376
377        let max_batch_size = if config.max_batch_size == 0 {
378            DEFAULT_MAX_BATCH_SIZE
379        } else {
380            config.max_batch_size
381        };
382
383        let api_key_env = normalize_api_key(config.api_key_env.clone());
384        let model = config.model.clone();
385
386        let client = Client::builder()
387            .timeout(Duration::from_millis(timeout_ms))
388            .redirect(reqwest::redirect::Policy::none())
389            .build()
390            .map_err(|error| format!("failed to configure embedding client: {error}"))?;
391
392        let engine = match config.backend {
393            SemanticBackend::Fastembed => {
394                SemanticEmbeddingEngine::Fastembed(initialize_text_embedding(&model)?)
395            }
396            SemanticBackend::OpenAiCompatible => {
397                let raw = config.base_url.as_ref().ok_or_else(|| {
398                    "base_url is required for openai_compatible backend".to_string()
399                })?;
400                let base_url = normalize_base_url(raw)?;
401
402                let api_key = match api_key_env {
403                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
404                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
405                    })?),
406                    None => None,
407                };
408
409                SemanticEmbeddingEngine::OpenAiCompatible {
410                    client,
411                    model,
412                    base_url,
413                    api_key,
414                }
415            }
416            SemanticBackend::Ollama => {
417                let raw = config
418                    .base_url
419                    .as_ref()
420                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
421                let base_url = normalize_base_url(raw)?;
422
423                SemanticEmbeddingEngine::Ollama {
424                    client,
425                    model,
426                    base_url,
427                }
428            }
429        };
430
431        Ok(Self {
432            backend: config.backend,
433            model: config.model.clone(),
434            base_url: config.base_url.clone(),
435            timeout_ms,
436            max_batch_size,
437            dimension: None,
438            engine,
439            query_embedding_cache: HashMap::new(),
440            query_embedding_cache_order: VecDeque::new(),
441            query_embedding_cache_hits: 0,
442            query_embedding_cache_misses: 0,
443        })
444    }
445
446    pub fn backend(&self) -> SemanticBackend {
447        self.backend
448    }
449
450    pub fn model(&self) -> &str {
451        &self.model
452    }
453
454    pub fn base_url(&self) -> Option<&str> {
455        self.base_url.as_deref()
456    }
457
458    pub fn max_batch_size(&self) -> usize {
459        self.max_batch_size
460    }
461
462    pub fn timeout_ms(&self) -> u64 {
463        self.timeout_ms
464    }
465
466    pub fn fingerprint(
467        &mut self,
468        config: &SemanticBackendConfig,
469    ) -> Result<SemanticIndexFingerprint, String> {
470        let dimension = self.dimension()?;
471        Ok(SemanticIndexFingerprint::from_config(config, dimension))
472    }
473
474    pub fn dimension(&mut self) -> Result<usize, String> {
475        if let Some(dimension) = self.dimension {
476            return Ok(dimension);
477        }
478
479        let dimension = match &mut self.engine {
480            SemanticEmbeddingEngine::Fastembed(model) => {
481                let vectors = model
482                    .embed(vec!["semantic index fingerprint probe".to_string()], None)
483                    .map_err(|error| format_embedding_init_error(error.to_string()))?;
484                vectors
485                    .first()
486                    .map(|v| v.len())
487                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
488            }
489            SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
490                let vectors =
491                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
492                vectors
493                    .first()
494                    .map(|v| v.len())
495                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
496            }
497            SemanticEmbeddingEngine::Ollama { .. } => {
498                let vectors =
499                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
500                vectors
501                    .first()
502                    .map(|v| v.len())
503                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
504            }
505        };
506
507        self.dimension = Some(dimension);
508        Ok(dimension)
509    }
510
511    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
512        self.embed_texts(texts)
513    }
514
515    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
516        if let Some(vector) = self.query_embedding_cache.get(query) {
517            self.query_embedding_cache_hits += 1;
518            return Ok(vector.clone());
519        }
520
521        self.query_embedding_cache_misses += 1;
522        let embeddings = self.embed_texts(vec![query.to_string()])?;
523        let vector = embeddings
524            .first()
525            .cloned()
526            .ok_or_else(|| "embedding model returned no query vector".to_string())?;
527
528        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
529            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
530                self.query_embedding_cache.remove(&oldest);
531            }
532        }
533        self.query_embedding_cache
534            .insert(query.to_string(), vector.clone());
535        self.query_embedding_cache_order
536            .push_back(query.to_string());
537
538        Ok(vector)
539    }
540
541    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
542        (
543            self.query_embedding_cache_hits,
544            self.query_embedding_cache_misses,
545            self.query_embedding_cache.len(),
546        )
547    }
548
549    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
550        match &mut self.engine {
551            SemanticEmbeddingEngine::Fastembed(model) => model
552                .embed(texts, None::<usize>)
553                .map_err(|error| format_embedding_init_error(error.to_string()))
554                .map_err(|error| format!("failed to embed batch: {error}")),
555            SemanticEmbeddingEngine::OpenAiCompatible {
556                client,
557                model,
558                base_url,
559                api_key,
560            } => {
561                let expected_text_count = texts.len();
562                let endpoint = build_openai_embeddings_endpoint(base_url);
563                let body = serde_json::json!({
564                    "input": texts,
565                    "model": model,
566                });
567
568                let raw = send_embedding_request(
569                    || {
570                        // `.json(&body)` sets Content-Type: application/json
571                        // automatically. Do NOT add `.header("Content-Type",
572                        // "application/json")` afterwards — RequestBuilder::header()
573                        // calls HeaderMap::append, which produces TWO Content-Type
574                        // headers on the wire. OpenAI's /v1/embeddings endpoint
575                        // treats duplicate Content-Type as malformed and rejects
576                        // the body with 400 "you must provide a model parameter"
577                        // even when `model` is set. Verified end-to-end against
578                        // api.openai.com. See issue #36.
579                        let mut request = client.post(&endpoint).json(&body);
580
581                        if let Some(api_key) = api_key {
582                            request = request.header("Authorization", format!("Bearer {api_key}"));
583                        }
584
585                        request
586                    },
587                    "openai compatible",
588                )?;
589
590                #[derive(Deserialize)]
591                struct OpenAiResponse {
592                    data: Vec<OpenAiEmbeddingResult>,
593                }
594
595                #[derive(Deserialize)]
596                struct OpenAiEmbeddingResult {
597                    embedding: Vec<f32>,
598                    index: Option<u32>,
599                }
600
601                let parsed: OpenAiResponse = serde_json::from_str(&raw)
602                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
603                if parsed.data.len() != expected_text_count {
604                    return Err(format!(
605                        "openai compatible response returned {} embeddings for {} inputs",
606                        parsed.data.len(),
607                        expected_text_count
608                    ));
609                }
610
611                let mut vectors = vec![Vec::new(); parsed.data.len()];
612                for (i, item) in parsed.data.into_iter().enumerate() {
613                    let index = item.index.unwrap_or(i as u32) as usize;
614                    if index >= vectors.len() {
615                        return Err(
616                            "openai compatible response contains invalid vector index".to_string()
617                        );
618                    }
619                    vectors[index] = item.embedding;
620                }
621
622                for vector in &vectors {
623                    if vector.is_empty() {
624                        return Err(
625                            "openai compatible response contained missing vectors".to_string()
626                        );
627                    }
628                }
629
630                self.dimension = vectors.first().map(Vec::len);
631                Ok(vectors)
632            }
633            SemanticEmbeddingEngine::Ollama {
634                client,
635                model,
636                base_url,
637            } => {
638                let expected_text_count = texts.len();
639                let endpoint = build_ollama_embeddings_endpoint(base_url);
640
641                #[derive(Serialize)]
642                struct OllamaPayload<'a> {
643                    model: &'a str,
644                    input: Vec<String>,
645                }
646
647                let payload = OllamaPayload {
648                    model,
649                    input: texts,
650                };
651
652                let raw = send_embedding_request(
653                    || {
654                        // `.json(&payload)` sets Content-Type automatically.
655                        // Same duplicate-header trap as the OpenAI branch above
656                        // — most Ollama servers tolerate it, but the
657                        // single-Content-Type form is the correct one.
658                        client.post(&endpoint).json(&payload)
659                    },
660                    "ollama",
661                )?;
662
663                #[derive(Deserialize)]
664                struct OllamaResponse {
665                    embeddings: Vec<Vec<f32>>,
666                }
667
668                let parsed: OllamaResponse = serde_json::from_str(&raw)
669                    .map_err(|error| format!("invalid ollama response: {error}"))?;
670                if parsed.embeddings.is_empty() {
671                    return Err("ollama response returned no embeddings".to_string());
672                }
673                if parsed.embeddings.len() != expected_text_count {
674                    return Err(format!(
675                        "ollama response returned {} embeddings for {} inputs",
676                        parsed.embeddings.len(),
677                        expected_text_count
678                    ));
679                }
680
681                let vectors = parsed.embeddings;
682                for vector in &vectors {
683                    if vector.is_empty() {
684                        return Err("ollama response contained empty embeddings".to_string());
685                    }
686                }
687
688                self.dimension = vectors.first().map(Vec::len);
689                Ok(vectors)
690            }
691        }
692    }
693}
694
/// Pre-validate ONNX Runtime by attempting a raw dlopen before ort touches it.
/// This catches broken/incompatible .so files without risking a panic in the ort crate.
///
/// The library to load is taken from `ORT_DYLIB_PATH` when set, otherwise the
/// platform default file name is used. After a successful load the runtime
/// version is inferred from file names (see `detect_ort_version_from_path`);
/// no ONNX Runtime API is called. When a version is found, it must have major
/// version 1 and minor version 20 or newer. When no version can be inferred
/// or parsed, the check passes.
///
/// On Windows no pre-validation is performed.
///
/// # Errors
///
/// Returns a user-facing message when the library path contains a NUL byte,
/// when `dlopen` fails, or when the detected version is incompatible.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY: `c_name` is a valid NUL-terminated string that outlives the
        // `dlopen` call; the `dlerror` pointer is null-checked before it is
        // read; `dlclose` is only reached with a non-null handle.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Try to detect the runtime version from the file path or soname.
            // libonnxruntime.so.1.19.0, libonnxruntime.1.24.4.dylib, etc.
            let detected_version = detect_ort_version_from_path(lib_name);

            // The handle was only needed to prove the library loads.
            libc::dlclose(handle);

            // Check version compatibility — major must be 1 and minor at
            // least 20. A version that is missing or does not parse skips
            // the check.
            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, lib_name));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        // On Windows, skip pre-validation — let ort handle LoadLibrary
        let _ = dylib_path;
    }

    Ok(())
}
757
758/// Try to extract the ORT version from the library filename or resolved symlink.
759/// Examples: "libonnxruntime.so.1.19.0" → "1.19.0", "libonnxruntime.1.24.4.dylib" → "1.24.4"
760#[cfg(any(test, target_os = "linux", target_os = "macos"))]
761fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
762    let path = std::path::Path::new(lib_path);
763
764    // Try the path as given, then follow symlinks
765    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
766        .into_iter()
767        .flatten()
768    {
769        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
770            if let Some(version) = extract_version_from_filename(name) {
771                return Some(version);
772            }
773        }
774    }
775
776    // Also check for versioned siblings in the same directory
777    if let Some(parent) = path.parent() {
778        if let Ok(entries) = std::fs::read_dir(parent) {
779            for entry in entries.flatten() {
780                if let Some(name) = entry.file_name().to_str() {
781                    if name.starts_with("libonnxruntime") {
782                        if let Some(version) = extract_version_from_filename(name) {
783                            return Some(version);
784                        }
785                    }
786                }
787            }
788        }
789    }
790
791    None
792}
793
794/// Extract version from filenames like "libonnxruntime.so.1.19.0" or "libonnxruntime.1.24.4.dylib"
795#[cfg(any(test, target_os = "linux", target_os = "macos"))]
796fn extract_version_from_filename(name: &str) -> Option<String> {
797    // Match patterns: .so.X.Y.Z or .X.Y.Z.dylib or .X.Y.Z.so
798    let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
799    re.find(name).map(|m| m.as_str().to_string())
800}
801
/// Suggest a shell command (prefixed with three spaces) that removes the
/// ONNX Runtime library found at `lib_path`.
///
/// Libraries under "/usr/local/lib", or referenced by the bare names
/// "libonnxruntime.so" / "libonnxruntime.dylib", get a platform-specific
/// system-wide cleanup hint. Anything else gets a plain `rm` of the exact path.
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib")
        || lib_path.starts_with("/usr/local/lib");

    if is_system_install {
        if cfg!(target_os = "linux") {
            return "   sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        }
        if cfg!(target_os = "macos") {
            return "   sudo rm /usr/local/lib/libonnxruntime*".to_string();
        }
        if cfg!(target_os = "windows") {
            return "   Delete the ONNX Runtime DLL from your PATH".to_string();
        }
        // Any other platform (only reachable in test builds) falls through.
    }

    format!("   rm '{}'", lib_path)
}
817
818/// Build the user-facing error message for an incompatible ONNX Runtime
819/// install. Extracted as a pure helper so we can unit-test the wording
820/// stability — the auto-fix recommendation must always come first because
821/// it's the only safe option, and the system-rm step must remain present
822/// because some users prefer the system-wide cleanup path.
823#[cfg(any(test, target_os = "linux", target_os = "macos"))]
824pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
825    format!(
826        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
827         Solutions:\n\
828         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
829         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
830         configures the bridge to load it instead of the system library — no \
831         changes to '{}'.\n\
832         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
833         {}\n\
834         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
835         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
836        version,
837        lib_name,
838        lib_name,
839        suggest_removal_command(lib_name),
840    )
841}
842
843pub fn initialize_text_embedding(model: &str) -> Result<TextEmbedding, String> {
844    // Pre-validate before ort can panic on a bad library
845    pre_validate_onnx_runtime()?;
846
847    let selected_model = match model {
848        "all-MiniLM-L6-v2" | "all-minilm-l6-v2" => FastembedEmbeddingModel::AllMiniLML6V2,
849        _ => {
850            return Err(format!(
851                "unsupported fastembed model '{}'. Supported: all-MiniLM-L6-v2",
852                model
853            ))
854        }
855    };
856
857    TextEmbedding::try_new(InitOptions::new(selected_model)).map_err(format_embedding_init_error)
858}
859
/// Heuristically decide whether an error message means the ONNX Runtime
/// shared library could not be located or loaded.
///
/// A message qualifies when it starts (after leading whitespace) with
/// "ONNX Runtime not found.", or when, compared case-insensitively, it both
/// names the runtime and contains a dynamic-load failure phrase.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_NAMES: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_HINTS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

    mentions_any(&RUNTIME_NAMES) && mentions_any(&LOAD_FAILURE_HINTS)
}
885
886fn format_embedding_init_error(error: impl Display) -> String {
887    let message = error.to_string();
888
889    if is_onnx_runtime_unavailable(&message) {
890        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
891    }
892
893    format!("failed to initialize semantic embedding model: {message}")
894}
895
896/// A chunk of code ready for embedding — derived from a Symbol with context enrichment
897#[derive(Debug, Clone)]
898pub struct SemanticChunk {
899    /// Absolute file path
900    pub file: PathBuf,
901    /// Symbol name
902    pub name: String,
903    /// Symbol kind (function, class, struct, etc.)
904    pub kind: SymbolKind,
905    /// Line range (0-based internally, inclusive)
906    pub start_line: u32,
907    pub end_line: u32,
908    /// Whether the symbol is exported
909    pub exported: bool,
910    /// The enriched text that gets embedded (scope + signature + body snippet)
911    pub embed_text: String,
912    /// Short code snippet for display in results
913    pub snippet: String,
914}
915
916/// A stored embedding entry — chunk metadata + vector
917#[derive(Debug)]
918struct EmbeddingEntry {
919    chunk: SemanticChunk,
920    vector: Vec<f32>,
921}
922
923/// The semantic index — stores embeddings for all symbols in a project
924#[derive(Debug)]
925pub struct SemanticIndex {
926    entries: Vec<EmbeddingEntry>,
927    /// Track which files are indexed and their mtime for staleness detection
928    file_mtimes: HashMap<PathBuf, SystemTime>,
929    /// Track indexed file sizes alongside mtimes for staleness detection
930    file_sizes: HashMap<PathBuf, u64>,
931    file_hashes: HashMap<PathBuf, blake3::Hash>,
932    /// Embedding dimension (384 for MiniLM-L6-v2)
933    dimension: usize,
934    fingerprint: Option<SemanticIndexFingerprint>,
935    project_root: PathBuf,
936}
937
938#[derive(Debug, Clone, Copy)]
939struct IndexedFileMetadata {
940    mtime: SystemTime,
941    size: u64,
942    content_hash: blake3::Hash,
943}
944
/// Outcome of an incremental refresh of the semantic index.
///
/// All counts are numbers of files. `total_processed` is the number of
/// current/deleted files that were considered.
#[derive(Clone, Copy, Debug, Default)]
pub struct RefreshSummary {
    /// Previously indexed files that were re-embedded.
    pub changed: usize,
    /// Files indexed for the first time.
    pub added: usize,
    /// Files whose entries were dropped.
    pub deleted: usize,
    /// Files considered during the refresh.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// True when the refresh changed, added, and deleted nothing.
    /// `total_processed` is not taken into account.
    pub fn is_noop(&self) -> bool {
        [self.changed, self.added, self.deleted]
            .iter()
            .all(|&count| count == 0)
    }
}
961
962/// Search result from a semantic query
963#[derive(Debug, Clone)]
964pub struct SemanticResult {
965    pub file: PathBuf,
966    pub name: String,
967    pub kind: SymbolKind,
968    pub start_line: u32,
969    pub end_line: u32,
970    pub exported: bool,
971    pub snippet: String,
972    pub score: f32,
973    pub source: &'static str,
974}
975
976impl SemanticIndex {
977    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
978        debug_assert!(project_root.is_absolute());
979        Self {
980            entries: Vec::new(),
981            file_mtimes: HashMap::new(),
982            file_sizes: HashMap::new(),
983            file_hashes: HashMap::new(),
984            dimension,
985            fingerprint: None,
986            project_root,
987        }
988    }
989
990    /// Number of embedded symbol entries.
991    pub fn entry_count(&self) -> usize {
992        self.entries.len()
993    }
994
995    /// Human-readable status label for the index.
996    pub fn status_label(&self) -> &'static str {
997        if self.entries.is_empty() {
998            "empty"
999        } else {
1000            "ready"
1001        }
1002    }
1003
1004    fn collect_chunks(
1005        project_root: &Path,
1006        files: &[PathBuf],
1007    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1008        let per_file: Vec<(
1009            PathBuf,
1010            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1011        )> = files
1012            .par_iter()
1013            .map_init(HashMap::new, |parsers, file| {
1014                let result = collect_file_metadata(file).and_then(|metadata| {
1015                    collect_file_chunks(project_root, file, parsers)
1016                        .map(|chunks| (metadata, chunks))
1017                });
1018                (file.clone(), result)
1019            })
1020            .collect();
1021
1022        let mut chunks: Vec<SemanticChunk> = Vec::new();
1023        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1024
1025        for (file, result) in per_file {
1026            match result {
1027                Ok((metadata, file_chunks)) => {
1028                    file_metadata.insert(file, metadata);
1029                    chunks.extend(file_chunks);
1030                }
1031                Err(error) => {
1032                    // "unsupported file extension" is expected for non-code files
1033                    // (json, xml, .gitignore, etc.) that get included in the
1034                    // project walk. Pre-fix this was swallowed by .unwrap_or_default();
1035                    // we now skip silently to keep the log clean. Only real read/parse
1036                    // errors are worth surfacing.
1037                    if error == "unsupported file extension" {
1038                        continue;
1039                    }
1040                    slog_warn!(
1041                        "failed to collect semantic chunks for {}: {}",
1042                        file.display(),
1043                        error
1044                    );
1045                }
1046            }
1047        }
1048
1049        (chunks, file_metadata)
1050    }
1051
1052    fn build_from_chunks<F, P>(
1053        project_root: &Path,
1054        chunks: Vec<SemanticChunk>,
1055        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1056        embed_fn: &mut F,
1057        max_batch_size: usize,
1058        mut progress: Option<&mut P>,
1059    ) -> Result<Self, String>
1060    where
1061        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1062        P: FnMut(usize, usize),
1063    {
1064        debug_assert!(project_root.is_absolute());
1065        let total_chunks = chunks.len();
1066
1067        if chunks.is_empty() {
1068            return Ok(Self {
1069                entries: Vec::new(),
1070                file_mtimes: file_metadata
1071                    .iter()
1072                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
1073                    .collect(),
1074                file_sizes: file_metadata
1075                    .iter()
1076                    .map(|(path, metadata)| (path.clone(), metadata.size))
1077                    .collect(),
1078                file_hashes: file_metadata
1079                    .into_iter()
1080                    .map(|(path, metadata)| (path, metadata.content_hash))
1081                    .collect(),
1082                dimension: DEFAULT_DIMENSION,
1083                fingerprint: None,
1084                project_root: project_root.to_path_buf(),
1085            });
1086        }
1087
1088        // Embed in batches
1089        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1090        let mut expected_dimension: Option<usize> = None;
1091        let batch_size = max_batch_size.max(1);
1092        for batch_start in (0..chunks.len()).step_by(batch_size) {
1093            let batch_end = (batch_start + batch_size).min(chunks.len());
1094            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1095                .iter()
1096                .map(|c| c.embed_text.clone())
1097                .collect();
1098
1099            let vectors = embed_fn(batch_texts)?;
1100            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1101
1102            // Track consistent dimension across all batches
1103            if let Some(dim) = vectors.first().map(|v| v.len()) {
1104                match expected_dimension {
1105                    None => expected_dimension = Some(dim),
1106                    Some(expected) if dim != expected => {
1107                        return Err(format!(
1108                            "embedding dimension changed across batches: expected {expected}, got {dim}"
1109                        ));
1110                    }
1111                    _ => {}
1112                }
1113            }
1114
1115            for (i, vector) in vectors.into_iter().enumerate() {
1116                let chunk_idx = batch_start + i;
1117                entries.push(EmbeddingEntry {
1118                    chunk: chunks[chunk_idx].clone(),
1119                    vector,
1120                });
1121            }
1122
1123            if let Some(callback) = progress.as_mut() {
1124                callback(entries.len(), total_chunks);
1125            }
1126        }
1127
1128        let dimension = entries
1129            .first()
1130            .map(|e| e.vector.len())
1131            .unwrap_or(DEFAULT_DIMENSION);
1132
1133        Ok(Self {
1134            entries,
1135            file_mtimes: file_metadata
1136                .iter()
1137                .map(|(path, metadata)| (path.clone(), metadata.mtime))
1138                .collect(),
1139            file_sizes: file_metadata
1140                .iter()
1141                .map(|(path, metadata)| (path.clone(), metadata.size))
1142                .collect(),
1143            file_hashes: file_metadata
1144                .into_iter()
1145                .map(|(path, metadata)| (path, metadata.content_hash))
1146                .collect(),
1147            dimension,
1148            fingerprint: None,
1149            project_root: project_root.to_path_buf(),
1150        })
1151    }
1152
1153    /// Build the semantic index from a set of files using the provided embedding function.
1154    /// `embed_fn` takes a batch of texts and returns a batch of embedding vectors.
1155    pub fn build<F>(
1156        project_root: &Path,
1157        files: &[PathBuf],
1158        embed_fn: &mut F,
1159        max_batch_size: usize,
1160    ) -> Result<Self, String>
1161    where
1162        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1163    {
1164        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1165        Self::build_from_chunks(
1166            project_root,
1167            chunks,
1168            file_mtimes,
1169            embed_fn,
1170            max_batch_size,
1171            Option::<&mut fn(usize, usize)>::None,
1172        )
1173    }
1174
1175    /// Build the semantic index and report embedding progress using entry counts.
1176    pub fn build_with_progress<F, P>(
1177        project_root: &Path,
1178        files: &[PathBuf],
1179        embed_fn: &mut F,
1180        max_batch_size: usize,
1181        progress: &mut P,
1182    ) -> Result<Self, String>
1183    where
1184        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1185        P: FnMut(usize, usize),
1186    {
1187        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1188        let total_chunks = chunks.len();
1189        progress(0, total_chunks);
1190        Self::build_from_chunks(
1191            project_root,
1192            chunks,
1193            file_mtimes,
1194            embed_fn,
1195            max_batch_size,
1196            Some(progress),
1197        )
1198    }
1199
1200    /// Incrementally refresh entries for changed/new files only, preserving cached
1201    /// embeddings for unchanged files. Used when loading the index from disk and
1202    /// finding that a small fraction of files have moved on, deleted, or appeared.
1203    ///
1204    /// Returns `RefreshSummary` describing what changed. On success, `self` is
1205    /// mutated in place and remains a valid index.
1206    ///
1207    /// `current_files` is the full set of files the project considers indexable
1208    /// (typically `walk_project_files(...)`). Files in the cache that are no
1209    /// longer in this set are treated as deleted.
1210    pub fn refresh_stale_files<F, P>(
1211        &mut self,
1212        project_root: &Path,
1213        current_files: &[PathBuf],
1214        embed_fn: &mut F,
1215        max_batch_size: usize,
1216        progress: &mut P,
1217    ) -> Result<RefreshSummary, String>
1218    where
1219        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1220        P: FnMut(usize, usize),
1221    {
1222        self.backfill_missing_file_sizes();
1223
1224        // 1. Bucket files into deleted / changed / added.
1225        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
1226        let total_processed = current_set.len() + self.file_mtimes.len()
1227            - self
1228                .file_mtimes
1229                .keys()
1230                .filter(|path| current_set.contains(path.as_path()))
1231                .count();
1232
1233        // Files in cache that disappeared from disk OR are no longer in the
1234        // walked set. Both cases need their entries dropped.
1235        let mut deleted: Vec<PathBuf> = Vec::new();
1236        let mut changed: Vec<PathBuf> = Vec::new();
1237        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
1238        for indexed_path in &indexed_paths {
1239            if !current_set.contains(indexed_path.as_path()) {
1240                deleted.push(indexed_path.clone());
1241                continue;
1242            }
1243            let cached = match (
1244                self.file_mtimes.get(indexed_path),
1245                self.file_sizes.get(indexed_path),
1246                self.file_hashes.get(indexed_path),
1247            ) {
1248                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
1249                    mtime: *mtime,
1250                    size: *size,
1251                    content_hash: *hash,
1252                }),
1253                _ => None,
1254            };
1255            match cached.map(|freshness| cache_freshness::verify_file(indexed_path, &freshness)) {
1256                Some(FreshnessVerdict::HotFresh) => {}
1257                Some(FreshnessVerdict::ContentFresh {
1258                    new_mtime,
1259                    new_size,
1260                }) => {
1261                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
1262                    self.file_sizes.insert(indexed_path.clone(), new_size);
1263                }
1264                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
1265                    changed.push(indexed_path.clone());
1266                }
1267            }
1268        }
1269
1270        // Files in walk that were never indexed.
1271        let mut added: Vec<PathBuf> = Vec::new();
1272        for path in current_files {
1273            if !self.file_mtimes.contains_key(path) {
1274                added.push(path.clone());
1275            }
1276        }
1277
1278        // Fast path: nothing to do.
1279        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
1280            progress(0, 0);
1281            return Ok(RefreshSummary {
1282                total_processed,
1283                ..RefreshSummary::default()
1284            });
1285        }
1286
1287        // 2. Drop entries for deleted files immediately. Changed files are only
1288        //    replaced after successful re-extraction + embedding so transient
1289        //    read/parse errors keep the stale-but-valid cache entry.
1290        if !deleted.is_empty() {
1291            let deleted_set: HashSet<&Path> = deleted.iter().map(PathBuf::as_path).collect();
1292            self.entries
1293                .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
1294            for path in &deleted {
1295                self.file_mtimes.remove(path);
1296                self.file_sizes.remove(path);
1297                self.file_hashes.remove(path);
1298            }
1299        }
1300
1301        // 3. Embed the changed + added set, if any.
1302        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
1303        to_embed.extend(changed.iter().cloned());
1304        to_embed.extend(added.iter().cloned());
1305
1306        if to_embed.is_empty() {
1307            // Only deletions happened.
1308            progress(0, 0);
1309            return Ok(RefreshSummary {
1310                changed: 0,
1311                added: 0,
1312                deleted: deleted.len(),
1313                total_processed,
1314            });
1315        }
1316
1317        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
1318
1319        if chunks.is_empty() {
1320            progress(0, 0);
1321            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1322            if !successful_files.is_empty() {
1323                self.entries
1324                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
1325            }
1326            let changed_count = changed
1327                .iter()
1328                .filter(|path| successful_files.contains(*path))
1329                .count();
1330            let added_count = added
1331                .iter()
1332                .filter(|path| successful_files.contains(*path))
1333                .count();
1334            for (file, metadata) in fresh_metadata {
1335                self.file_mtimes.insert(file.clone(), metadata.mtime);
1336                self.file_sizes.insert(file.clone(), metadata.size);
1337                self.file_hashes.insert(file.clone(), metadata.content_hash);
1338            }
1339            return Ok(RefreshSummary {
1340                changed: changed_count,
1341                added: added_count,
1342                deleted: deleted.len(),
1343                total_processed,
1344            });
1345        }
1346
1347        // 4. Embed in batches and dimension-check against the existing index.
1348        let total_chunks = chunks.len();
1349        progress(0, total_chunks);
1350        let batch_size = max_batch_size.max(1);
1351        let existing_dimension = if self.entries.is_empty() {
1352            None
1353        } else {
1354            Some(self.dimension)
1355        };
1356        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1357        let mut observed_dimension: Option<usize> = existing_dimension;
1358
1359        for batch_start in (0..chunks.len()).step_by(batch_size) {
1360            let batch_end = (batch_start + batch_size).min(chunks.len());
1361            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1362                .iter()
1363                .map(|c| c.embed_text.clone())
1364                .collect();
1365
1366            let vectors = embed_fn(batch_texts)?;
1367            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1368
1369            if let Some(dim) = vectors.first().map(|v| v.len()) {
1370                match observed_dimension {
1371                    None => observed_dimension = Some(dim),
1372                    Some(expected) if dim != expected => {
1373                        // Refuse to mix dimensions in one index. Caller should
1374                        // fall back to a full rebuild.
1375                        return Err(format!(
1376                            "embedding dimension changed during incremental refresh: \
1377                             cached index uses {expected}, new vectors use {dim}"
1378                        ));
1379                    }
1380                    _ => {}
1381                }
1382            }
1383
1384            for (i, vector) in vectors.into_iter().enumerate() {
1385                let chunk_idx = batch_start + i;
1386                new_entries.push(EmbeddingEntry {
1387                    chunk: chunks[chunk_idx].clone(),
1388                    vector,
1389                });
1390            }
1391
1392            progress(new_entries.len(), total_chunks);
1393        }
1394
1395        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1396        if !successful_files.is_empty() {
1397            self.entries
1398                .retain(|entry| !successful_files.contains(&entry.chunk.file));
1399        }
1400
1401        self.entries.extend(new_entries);
1402        for (file, metadata) in fresh_metadata {
1403            self.file_mtimes.insert(file.clone(), metadata.mtime);
1404            self.file_sizes.insert(file.clone(), metadata.size);
1405            self.file_hashes.insert(file, metadata.content_hash);
1406        }
1407        if let Some(dim) = observed_dimension {
1408            self.dimension = dim;
1409        }
1410
1411        Ok(RefreshSummary {
1412            changed: changed
1413                .iter()
1414                .filter(|path| successful_files.contains(*path))
1415                .count(),
1416            added: added
1417                .iter()
1418                .filter(|path| successful_files.contains(*path))
1419                .count(),
1420            deleted: deleted.len(),
1421            total_processed,
1422        })
1423    }
1424
1425    /// Search the index with a query embedding, returning top-K results sorted by relevance
1426    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1427        if self.entries.is_empty() || query_vector.len() != self.dimension {
1428            return Vec::new();
1429        }
1430
1431        let mut scored: Vec<(f32, usize)> = self
1432            .entries
1433            .iter()
1434            .enumerate()
1435            .map(|(i, entry)| {
1436                let mut score = cosine_similarity(query_vector, &entry.vector);
1437                if entry.chunk.exported {
1438                    score *= 1.1;
1439                }
1440                (score, i)
1441            })
1442            .collect();
1443
1444        // Sort descending by score
1445        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1446
1447        scored
1448            .into_iter()
1449            .take(top_k)
1450            // Keep the sort → take → map ordering explicit: removing the old
1451            // `> 0.0` floor cannot evict positive hits because top_k has already
1452            // been selected, but it can surface zero-score noise in the tail.
1453            .map(|(score, idx)| {
1454                let entry = &self.entries[idx];
1455                SemanticResult {
1456                    file: entry.chunk.file.clone(),
1457                    name: entry.chunk.name.clone(),
1458                    kind: entry.chunk.kind.clone(),
1459                    start_line: entry.chunk.start_line,
1460                    end_line: entry.chunk.end_line,
1461                    exported: entry.chunk.exported,
1462                    snippet: entry.chunk.snippet.clone(),
1463                    score,
1464                    source: "semantic",
1465                }
1466            })
1467            .collect()
1468    }
1469
1470    /// Number of indexed entries
1471    pub fn len(&self) -> usize {
1472        self.entries.len()
1473    }
1474
1475    /// Check if a file needs re-indexing based on mtime/size
1476    pub fn is_file_stale(&self, file: &Path) -> bool {
1477        let Some(stored_mtime) = self.file_mtimes.get(file) else {
1478            return true;
1479        };
1480        let Some(stored_size) = self.file_sizes.get(file) else {
1481            return true;
1482        };
1483        let Some(stored_hash) = self.file_hashes.get(file) else {
1484            return true;
1485        };
1486        let cached = FileFreshness {
1487            mtime: *stored_mtime,
1488            size: *stored_size,
1489            content_hash: *stored_hash,
1490        };
1491        match cache_freshness::verify_file(file, &cached) {
1492            FreshnessVerdict::HotFresh => false,
1493            FreshnessVerdict::ContentFresh { .. } => false,
1494            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
1495        }
1496    }
1497
1498    fn backfill_missing_file_sizes(&mut self) {
1499        for path in self.file_mtimes.keys() {
1500            if self.file_sizes.contains_key(path) {
1501                continue;
1502            }
1503            if let Ok(metadata) = fs::metadata(path) {
1504                self.file_sizes.insert(path.clone(), metadata.len());
1505                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
1506                    self.file_hashes.insert(path.clone(), hash);
1507                }
1508            }
1509        }
1510    }
1511
1512    /// Remove entries for a specific file
1513    pub fn remove_file(&mut self, file: &Path) {
1514        self.invalidate_file(file);
1515    }
1516
1517    pub fn invalidate_file(&mut self, file: &Path) {
1518        self.entries.retain(|e| e.chunk.file != file);
1519        self.file_mtimes.remove(file);
1520        self.file_sizes.remove(file);
1521        self.file_hashes.remove(file);
1522    }
1523
1524    /// Get the embedding dimension
1525    pub fn dimension(&self) -> usize {
1526        self.dimension
1527    }
1528
1529    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
1530        self.fingerprint.as_ref()
1531    }
1532
1533    pub fn backend_label(&self) -> Option<&str> {
1534        self.fingerprint.as_ref().map(|f| f.backend.as_str())
1535    }
1536
1537    pub fn model_label(&self) -> Option<&str> {
1538        self.fingerprint.as_ref().map(|f| f.model.as_str())
1539    }
1540
1541    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
1542        self.fingerprint = Some(fingerprint);
1543    }
1544
1545    /// Write the semantic index to disk using atomic temp+rename pattern
1546    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
1547        // Don't persist empty indexes — they would be loaded on next startup
1548        // and prevent a fresh build that might find files.
1549        if self.entries.is_empty() {
1550            slog_info!("skipping semantic index persistence (0 entries)");
1551            return;
1552        }
1553        let dir = storage_dir.join("semantic").join(project_key);
1554        if let Err(e) = fs::create_dir_all(&dir) {
1555            slog_warn!("failed to create semantic cache dir: {}", e);
1556            return;
1557        }
1558        let data_path = dir.join("semantic.bin");
1559        let tmp_path = dir.join(format!(
1560            "semantic.bin.tmp.{}.{}",
1561            std::process::id(),
1562            SystemTime::now()
1563                .duration_since(SystemTime::UNIX_EPOCH)
1564                .unwrap_or(Duration::ZERO)
1565                .as_nanos()
1566        ));
1567        let bytes = self.to_bytes();
1568        let write_result = (|| -> std::io::Result<()> {
1569            use std::io::Write;
1570            let mut file = fs::File::create(&tmp_path)?;
1571            file.write_all(&bytes)?;
1572            file.sync_all()?;
1573            Ok(())
1574        })();
1575        if let Err(e) = write_result {
1576            slog_warn!("failed to write semantic index: {}", e);
1577            let _ = fs::remove_file(&tmp_path);
1578            return;
1579        }
1580        if let Err(e) = fs::rename(&tmp_path, &data_path) {
1581            slog_warn!("failed to rename semantic index: {}", e);
1582            let _ = fs::remove_file(&tmp_path);
1583            return;
1584        }
1585        slog_info!(
1586            "semantic index persisted: {} entries, {:.1} KB",
1587            self.entries.len(),
1588            bytes.len() as f64 / 1024.0
1589        );
1590    }
1591
1592    /// Read the semantic index from disk
1593    pub fn read_from_disk(
1594        storage_dir: &Path,
1595        project_key: &str,
1596        current_canonical_root: &Path,
1597        is_worktree_bridge: bool,
1598        expected_fingerprint: Option<&str>,
1599    ) -> Option<Self> {
1600        debug_assert!(current_canonical_root.is_absolute());
1601        let data_path = storage_dir
1602            .join("semantic")
1603            .join(project_key)
1604            .join("semantic.bin");
1605        let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
1606        if file_len < HEADER_BYTES_V1 {
1607            slog_warn!(
1608                "corrupt semantic index (too small: {} bytes), removing",
1609                file_len
1610            );
1611            if !is_worktree_bridge {
1612                let _ = fs::remove_file(&data_path);
1613            }
1614            return None;
1615        }
1616
1617        let bytes = fs::read(&data_path).ok()?;
1618        let version = bytes[0];
1619        if version != SEMANTIC_INDEX_VERSION_V6 {
1620            slog_info!(
1621                "cached semantic index version {} is older than {}, rebuilding",
1622                version,
1623                SEMANTIC_INDEX_VERSION_V6
1624            );
1625            if !is_worktree_bridge {
1626                let _ = fs::remove_file(&data_path);
1627            }
1628            return None;
1629        }
1630        match Self::from_bytes(&bytes, current_canonical_root) {
1631            Ok(index) => {
1632                if index.entries.is_empty() {
1633                    slog_info!("cached semantic index is empty, will rebuild");
1634                    if !is_worktree_bridge {
1635                        let _ = fs::remove_file(&data_path);
1636                    }
1637                    return None;
1638                }
1639                if let Some(expected) = expected_fingerprint {
1640                    let matches = index
1641                        .fingerprint()
1642                        .map(|fingerprint| fingerprint.matches_expected(expected))
1643                        .unwrap_or(false);
1644                    if !matches {
1645                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
1646                        if !is_worktree_bridge {
1647                            let _ = fs::remove_file(&data_path);
1648                        }
1649                        return None;
1650                    }
1651                }
1652                slog_info!(
1653                    "loaded semantic index from disk: {} entries",
1654                    index.entries.len()
1655                );
1656                Some(index)
1657            }
1658            Err(e) => {
1659                slog_warn!("corrupt semantic index, rebuilding: {}", e);
1660                if !is_worktree_bridge {
1661                    let _ = fs::remove_file(&data_path);
1662                }
1663                None
1664            }
1665        }
1666    }
1667
1668    /// Serialize the index to bytes for disk persistence
1669    pub fn to_bytes(&self) -> Vec<u8> {
1670        let mut buf = Vec::new();
1671        let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
1672            let encoded = fingerprint.as_string();
1673            if encoded.is_empty() {
1674                None
1675            } else {
1676                Some(encoded.into_bytes())
1677            }
1678        });
1679        let file_mtimes: Vec<_> = self
1680            .file_mtimes
1681            .iter()
1682            .filter_map(|(path, mtime)| {
1683                cache_relative_path(&self.project_root, path)
1684                    .map(|relative| (relative, path, mtime))
1685            })
1686            .collect();
1687        let entries: Vec<_> = self
1688            .entries
1689            .iter()
1690            .filter_map(|entry| {
1691                cache_relative_path(&self.project_root, &entry.chunk.file)
1692                    .map(|relative| (relative, entry))
1693            })
1694            .collect();
1695
1696        // Header: version(1) + dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint
1697        //
1698        // V6 is the single write format. Layout extends V5:
1699        //   - fingerprint is always represented (absent ⇒ fingerprint_len=0,
1700        //     no bytes follow). Uniform format simplifies the reader.
1701        //   - paths are relative to project_root.
1702        //   - file metadata stored as secs(u64) + subsec_nanos(u32) + size(u64) + blake3(32).
1703        //     Preserves full APFS/ext4/NTFS precision and catches mtime ties.
1704        //
1705        // V1/V2 remain readable for backward compatibility (see from_bytes).
1706        // V3/V4 load as compatible formats but are rejected on disk so snippets
1707        // and file sizes are rebuilt once.
1708        let version = SEMANTIC_INDEX_VERSION_V6;
1709        buf.push(version);
1710        buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
1711        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
1712        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
1713        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
1714        buf.extend_from_slice(fp_bytes_ref);
1715
1716        // File mtime table: count(4) + entries
1717        // V3 layout per entry: path_len(4) + path + secs(8) + subsec_nanos(4)
1718        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
1719        for (relative, path, mtime) in &file_mtimes {
1720            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
1721            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
1722            buf.extend_from_slice(&path_bytes);
1723            let duration = mtime
1724                .duration_since(SystemTime::UNIX_EPOCH)
1725                .unwrap_or_default();
1726            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
1727            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
1728            let size = self.file_sizes.get(*path).copied().unwrap_or_default();
1729            buf.extend_from_slice(&size.to_le_bytes());
1730            let hash = self
1731                .file_hashes
1732                .get(*path)
1733                .copied()
1734                .unwrap_or_else(cache_freshness::zero_hash);
1735            buf.extend_from_slice(hash.as_bytes());
1736        }
1737
1738        // Entries: each is metadata + vector
1739        for (relative, entry) in &entries {
1740            let c = &entry.chunk;
1741
1742            // File path
1743            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
1744            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
1745            buf.extend_from_slice(&file_bytes);
1746
1747            // Name
1748            let name_bytes = c.name.as_bytes();
1749            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
1750            buf.extend_from_slice(name_bytes);
1751
1752            // Kind (1 byte)
1753            buf.push(symbol_kind_to_u8(&c.kind));
1754
1755            // Lines + exported
1756            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
1757            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
1758            buf.push(c.exported as u8);
1759
1760            // Snippet
1761            let snippet_bytes = c.snippet.as_bytes();
1762            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
1763            buf.extend_from_slice(snippet_bytes);
1764
1765            // Embed text
1766            let embed_bytes = c.embed_text.as_bytes();
1767            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
1768            buf.extend_from_slice(embed_bytes);
1769
1770            // Vector (f32 array)
1771            for &val in &entry.vector {
1772                buf.extend_from_slice(&val.to_le_bytes());
1773            }
1774        }
1775
1776        buf
1777    }
1778
1779    /// Deserialize the index from bytes
1780    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
1781        debug_assert!(current_canonical_root.is_absolute());
1782        let mut pos = 0;
1783
1784        if data.len() < HEADER_BYTES_V1 {
1785            return Err("data too short".to_string());
1786        }
1787
1788        let version = data[pos];
1789        pos += 1;
1790        if version != SEMANTIC_INDEX_VERSION_V1
1791            && version != SEMANTIC_INDEX_VERSION_V2
1792            && version != SEMANTIC_INDEX_VERSION_V3
1793            && version != SEMANTIC_INDEX_VERSION_V4
1794            && version != SEMANTIC_INDEX_VERSION_V5
1795            && version != SEMANTIC_INDEX_VERSION_V6
1796        {
1797            return Err(format!("unsupported version: {}", version));
1798        }
1799        // V2 and newer share the same header layout (V3/V4/V5 only differ from
1800        // V2 in the per-mtime entry layout): version(1) + dimension(4) +
1801        // entry_count(4) + fingerprint_len(4) + fingerprint bytes.
1802        if (version == SEMANTIC_INDEX_VERSION_V2
1803            || version == SEMANTIC_INDEX_VERSION_V3
1804            || version == SEMANTIC_INDEX_VERSION_V4
1805            || version == SEMANTIC_INDEX_VERSION_V5
1806            || version == SEMANTIC_INDEX_VERSION_V6)
1807            && data.len() < HEADER_BYTES_V2
1808        {
1809            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
1810        }
1811
1812        let dimension = read_u32(data, &mut pos)? as usize;
1813        let entry_count = read_u32(data, &mut pos)? as usize;
1814        if dimension == 0 || dimension > MAX_DIMENSION {
1815            return Err(format!("invalid embedding dimension: {}", dimension));
1816        }
1817        if entry_count > MAX_ENTRIES {
1818            return Err(format!("too many semantic index entries: {}", entry_count));
1819        }
1820
1821        // Fingerprint handling:
1822        //   - V1: no fingerprint field at all.
1823        //   - V2: fingerprint_len + fingerprint bytes; always present (writer
1824        //     only emitted V2 when fingerprint was Some).
1825        //   - V3+: fingerprint_len always present; fingerprint_len==0 ⇒ None.
1826        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
1827            || version == SEMANTIC_INDEX_VERSION_V3
1828            || version == SEMANTIC_INDEX_VERSION_V4
1829            || version == SEMANTIC_INDEX_VERSION_V5
1830            || version == SEMANTIC_INDEX_VERSION_V6;
1831        let fingerprint = if has_fingerprint_field {
1832            let fingerprint_len = read_u32(data, &mut pos)? as usize;
1833            if pos + fingerprint_len > data.len() {
1834                return Err("unexpected end of data reading fingerprint".to_string());
1835            }
1836            if fingerprint_len == 0 {
1837                None
1838            } else {
1839                let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
1840                pos += fingerprint_len;
1841                Some(
1842                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
1843                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
1844                )
1845            }
1846        } else {
1847            None
1848        };
1849
1850        // File mtimes
1851        let mtime_count = read_u32(data, &mut pos)? as usize;
1852        if mtime_count > MAX_ENTRIES {
1853            return Err(format!("too many semantic file mtimes: {}", mtime_count));
1854        }
1855
1856        let vector_bytes = entry_count
1857            .checked_mul(dimension)
1858            .and_then(|count| count.checked_mul(F32_BYTES))
1859            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
1860        if vector_bytes > data.len().saturating_sub(pos) {
1861            return Err("semantic index vectors exceed available data".to_string());
1862        }
1863
1864        let mut file_mtimes = HashMap::with_capacity(mtime_count);
1865        let mut file_sizes = HashMap::with_capacity(mtime_count);
1866        let mut file_hashes = HashMap::with_capacity(mtime_count);
1867        for _ in 0..mtime_count {
1868            let path = read_string(data, &mut pos)?;
1869            let secs = read_u64(data, &mut pos)?;
1870            // V3+ persists subsec_nanos alongside secs so staleness checks
1871            // survive restart round-trips. V1/V2 load with 0 nanos, which
1872            // causes one rebuild on upgrade (they never matched live APFS
1873            // mtimes anyway — the bug v0.15.2 fixes). After that rebuild,
1874            // the cache is persisted as V3 and stabilises.
1875            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
1876                || version == SEMANTIC_INDEX_VERSION_V4
1877                || version == SEMANTIC_INDEX_VERSION_V5
1878                || version == SEMANTIC_INDEX_VERSION_V6
1879            {
1880                read_u32(data, &mut pos)?
1881            } else {
1882                0
1883            };
1884            let size =
1885                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
1886                    read_u64(data, &mut pos)?
1887                } else {
1888                    0
1889                };
1890            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
1891                if pos + 32 > data.len() {
1892                    return Err("unexpected end of data reading content hash".to_string());
1893                }
1894                let mut hash_bytes = [0u8; 32];
1895                hash_bytes.copy_from_slice(&data[pos..pos + 32]);
1896                pos += 32;
1897                blake3::Hash::from_bytes(hash_bytes)
1898            } else {
1899                cache_freshness::zero_hash()
1900            };
1901            // Hardening against corrupt / maliciously crafted cache files
1902            // (v0.15.2). `Duration::new(secs, nanos)` can panic when the
1903            // nanosecond carry overflows the second counter, and
1904            // `SystemTime + Duration` can panic on carry past the platform's
1905            // upper bound. Explicit validation keeps a corrupted semantic.bin
1906            // from taking down the whole aft process.
1907            if nanos >= 1_000_000_000 {
1908                return Err(format!(
1909                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
1910                    nanos
1911                ));
1912            }
1913            let duration = std::time::Duration::new(secs, nanos);
1914            let mtime = SystemTime::UNIX_EPOCH
1915                .checked_add(duration)
1916                .ok_or_else(|| {
1917                    format!(
1918                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
1919                        secs, nanos
1920                    )
1921                })?;
1922            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
1923                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
1924                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
1925            } else {
1926                PathBuf::from(path)
1927            };
1928            file_mtimes.insert(path.clone(), mtime);
1929            file_sizes.insert(path.clone(), size);
1930            file_hashes.insert(path, content_hash);
1931        }
1932
1933        // Entries
1934        let mut entries = Vec::with_capacity(entry_count);
1935        for _ in 0..entry_count {
1936            let raw_file = PathBuf::from(read_string(data, &mut pos)?);
1937            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
1938                cached_path_under_root(current_canonical_root, &raw_file)
1939                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
1940            } else {
1941                raw_file
1942            };
1943            let name = read_string(data, &mut pos)?;
1944
1945            if pos >= data.len() {
1946                return Err("unexpected end of data".to_string());
1947            }
1948            let kind = u8_to_symbol_kind(data[pos]);
1949            pos += 1;
1950
1951            let start_line = read_u32(data, &mut pos)?;
1952            let end_line = read_u32(data, &mut pos)?;
1953
1954            if pos >= data.len() {
1955                return Err("unexpected end of data".to_string());
1956            }
1957            let exported = data[pos] != 0;
1958            pos += 1;
1959
1960            let snippet = read_string(data, &mut pos)?;
1961            let embed_text = read_string(data, &mut pos)?;
1962
1963            // Vector
1964            let vec_bytes = dimension
1965                .checked_mul(F32_BYTES)
1966                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
1967            if pos + vec_bytes > data.len() {
1968                return Err("unexpected end of data reading vector".to_string());
1969            }
1970            let mut vector = Vec::with_capacity(dimension);
1971            for _ in 0..dimension {
1972                let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
1973                vector.push(f32::from_le_bytes(bytes));
1974                pos += 4;
1975            }
1976
1977            entries.push(EmbeddingEntry {
1978                chunk: SemanticChunk {
1979                    file,
1980                    name,
1981                    kind,
1982                    start_line,
1983                    end_line,
1984                    exported,
1985                    embed_text,
1986                    snippet,
1987                },
1988                vector,
1989            });
1990        }
1991
1992        if entries.len() != entry_count {
1993            return Err(format!(
1994                "semantic cache entry count drift: header={} decoded={}",
1995                entry_count,
1996                entries.len()
1997            ));
1998        }
1999        for entry in &entries {
2000            if !file_mtimes.contains_key(&entry.chunk.file) {
2001                return Err(format!(
2002                    "semantic cache metadata missing for entry file {}",
2003                    entry.chunk.file.display()
2004                ));
2005            }
2006        }
2007
2008        Ok(Self {
2009            entries,
2010            file_mtimes,
2011            file_sizes,
2012            file_hashes,
2013            dimension,
2014            fingerprint,
2015            project_root: current_canonical_root.to_path_buf(),
2016        })
2017    }
2018}
2019
2020/// Build enriched embedding text from a symbol with cAST-style context
2021fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2022    let relative = file
2023        .strip_prefix(project_root)
2024        .unwrap_or(file)
2025        .to_string_lossy();
2026
2027    let kind_label = match &symbol.kind {
2028        SymbolKind::Function => "function",
2029        SymbolKind::Class => "class",
2030        SymbolKind::Method => "method",
2031        SymbolKind::Struct => "struct",
2032        SymbolKind::Interface => "interface",
2033        SymbolKind::Enum => "enum",
2034        SymbolKind::TypeAlias => "type",
2035        SymbolKind::Variable => "variable",
2036        SymbolKind::Heading => "heading",
2037        SymbolKind::FileSummary => "file-summary",
2038    };
2039
2040    // Build: "file:relative/path kind:function name:validateAuth signature:fn validateAuth(token: &str) -> bool"
2041    let name = &symbol.name;
2042    let mut text = format!(
2043        "name:{name} file:{} kind:{} name:{name}",
2044        relative, kind_label
2045    );
2046
2047    if let Some(sig) = &symbol.signature {
2048        text.push_str(&format!(" signature:{}", sig));
2049    }
2050
2051    // Add body snippet (first ~300 chars of symbol body)
2052    let lines: Vec<&str> = source.lines().collect();
2053    let start = (symbol.range.start_line as usize).min(lines.len());
2054    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2055    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2056    if start < end {
2057        let body: String = lines[start..end]
2058            .iter()
2059            .take(15) // max 15 lines
2060            .copied()
2061            .collect::<Vec<&str>>()
2062            .join("\n");
2063        let snippet = if body.len() > 300 {
2064            format!("{}...", &body[..body.floor_char_boundary(300)])
2065        } else {
2066            body
2067        };
2068        text.push_str(&format!(" body:{}", snippet));
2069    }
2070
2071    text
2072}
2073
/// Return the first `max_chars` characters (not bytes) of `value`, or all of
/// `value` when it is shorter than that.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte offset of the character at index `max_chars` is exactly where
    // the prefix ends; if there is no such character nothing is cut.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
2077
2078fn first_leading_doc_comment(source: &str) -> String {
2079    let lines: Vec<&str> = source.lines().collect();
2080    let Some((start, first)) = lines
2081        .iter()
2082        .enumerate()
2083        .find(|(_, line)| !line.trim().is_empty())
2084    else {
2085        return String::new();
2086    };
2087
2088    let trimmed = first.trim_start();
2089    if trimmed.starts_with("/**") {
2090        let mut comment = Vec::new();
2091        for line in lines.iter().skip(start) {
2092            comment.push(*line);
2093            if line.contains("*/") {
2094                break;
2095            }
2096        }
2097        return truncate_chars(&comment.join("\n"), 200);
2098    }
2099
2100    if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2101        let comment = lines
2102            .iter()
2103            .skip(start)
2104            .take_while(|line| {
2105                let trimmed = line.trim_start();
2106                trimmed.starts_with("///") || trimmed.starts_with("//!")
2107            })
2108            .copied()
2109            .collect::<Vec<_>>()
2110            .join("\n");
2111        return truncate_chars(&comment, 200);
2112    }
2113
2114    String::new()
2115}
2116
2117pub fn build_file_summary_chunk(
2118    file: &Path,
2119    project_root: &Path,
2120    source: &str,
2121    top_exports: &[&str],
2122    top_export_signatures: &[Option<&str>],
2123) -> SemanticChunk {
2124    let relative = file.strip_prefix(project_root).unwrap_or(file);
2125    let rel_path = relative.to_string_lossy();
2126    let parent_dir = relative
2127        .parent()
2128        .map(|parent| parent.to_string_lossy().to_string())
2129        .unwrap_or_default();
2130    let name = file
2131        .file_stem()
2132        .map(|stem| stem.to_string_lossy().to_string())
2133        .unwrap_or_default();
2134    let doc = first_leading_doc_comment(source);
2135    let exports = top_exports
2136        .iter()
2137        .take(5)
2138        .copied()
2139        .collect::<Vec<_>>()
2140        .join(",");
2141    let snippet = if doc.is_empty() {
2142        top_export_signatures
2143            .first()
2144            .and_then(|signature| signature.as_deref())
2145            .map(|signature| truncate_chars(signature, 200))
2146            .unwrap_or_default()
2147    } else {
2148        doc.clone()
2149    };
2150
2151    SemanticChunk {
2152        file: file.to_path_buf(),
2153        name,
2154        kind: SymbolKind::FileSummary,
2155        start_line: 0,
2156        end_line: 0,
2157        exported: false,
2158        embed_text: format!(
2159            "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2160            file.file_stem()
2161                .map(|stem| stem.to_string_lossy().to_string())
2162                .unwrap_or_default()
2163        ),
2164        snippet,
2165    }
2166}
2167
2168fn parser_for(
2169    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2170    lang: crate::parser::LangId,
2171) -> Result<&mut Parser, String> {
2172    use std::collections::hash_map::Entry;
2173
2174    match parsers.entry(lang) {
2175        Entry::Occupied(entry) => Ok(entry.into_mut()),
2176        Entry::Vacant(entry) => {
2177            let grammar = grammar_for(lang);
2178            let mut parser = Parser::new();
2179            parser
2180                .set_language(&grammar)
2181                .map_err(|error| error.to_string())?;
2182            Ok(entry.insert(parser))
2183        }
2184    }
2185}
2186
/// Report whether `path` has one of the file extensions that the semantic
/// index covers.
///
/// The comparison is case-sensitive; paths without an extension, or with one
/// that is not valid UTF-8, are not indexed.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "sol", "vue",
    ];

    match path.extension().and_then(|extension| extension.to_str()) {
        Some(extension) => INDEXED_EXTENSIONS.contains(&extension),
        None => false,
    }
}
2214
2215fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2216    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2217    let mtime = metadata.modified().map_err(|error| error.to_string())?;
2218    let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2219        .map_err(|error| error.to_string())?
2220        .unwrap_or_else(cache_freshness::zero_hash);
2221    Ok(IndexedFileMetadata {
2222        mtime,
2223        size: metadata.len(),
2224        content_hash,
2225    })
2226}
2227
2228fn collect_file_chunks(
2229    project_root: &Path,
2230    file: &Path,
2231    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2232) -> Result<Vec<SemanticChunk>, String> {
2233    if !is_semantic_indexed_extension(file) {
2234        return Err("unsupported file extension".to_string());
2235    }
2236    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2237    let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2238    let tree = parser_for(parsers, lang)?
2239        .parse(&source, None)
2240        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2241    let symbols =
2242        extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2243
2244    Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2245}
2246
2247/// Build a display snippet from a symbol's source
2248fn build_snippet(symbol: &Symbol, source: &str) -> String {
2249    let lines: Vec<&str> = source.lines().collect();
2250    let start = (symbol.range.start_line as usize).min(lines.len());
2251    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2252    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2253    if start < end {
2254        let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2255        let mut snippet = snippet_lines.join("\n");
2256        if end - start > 5 {
2257            snippet.push_str("\n  ...");
2258        }
2259        if snippet.len() > 300 {
2260            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2261        }
2262        snippet
2263    } else {
2264        String::new()
2265    }
2266}
2267
2268/// Convert symbols to semantic chunks with enriched context
2269fn symbols_to_chunks(
2270    file: &Path,
2271    symbols: &[Symbol],
2272    source: &str,
2273    project_root: &Path,
2274) -> Vec<SemanticChunk> {
2275    let mut chunks = Vec::new();
2276    let top_exports_with_signatures = symbols
2277        .iter()
2278        .filter(|symbol| {
2279            symbol.exported
2280                && symbol.parent.is_none()
2281                && !matches!(symbol.kind, SymbolKind::Heading)
2282        })
2283        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2284        .collect::<Vec<_>>();
2285
2286    let has_only_headings = !symbols.is_empty()
2287        && symbols
2288            .iter()
2289            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2290    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2291        let top_exports = top_exports_with_signatures
2292            .iter()
2293            .map(|(name, _)| *name)
2294            .collect::<Vec<_>>();
2295        let top_export_signatures = top_exports_with_signatures
2296            .iter()
2297            .map(|(_, signature)| *signature)
2298            .collect::<Vec<_>>();
2299        chunks.push(build_file_summary_chunk(
2300            file,
2301            project_root,
2302            source,
2303            &top_exports,
2304            &top_export_signatures,
2305        ));
2306    }
2307
2308    for symbol in symbols {
2309        // Skip Markdown / HTML heading chunks: empirically they dominate result
2310        // lists even for code-shaped queries because heading prose embeds well.
2311        // Agents querying for code lose the actual matches under doc noise.
2312        // README/docs queries are still served by grep on the same files.
2313        if matches!(symbol.kind, SymbolKind::Heading) {
2314            continue;
2315        }
2316
2317        // Skip very small symbols (single-line variables, etc.)
2318        let line_count = symbol
2319            .range
2320            .end_line
2321            .saturating_sub(symbol.range.start_line)
2322            + 1;
2323        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2324            continue;
2325        }
2326
2327        let embed_text = build_embed_text(symbol, source, file, project_root);
2328        let snippet = build_snippet(symbol, source);
2329
2330        chunks.push(SemanticChunk {
2331            file: file.to_path_buf(),
2332            name: symbol.name.clone(),
2333            kind: symbol.kind.clone(),
2334            start_line: symbol.range.start_line,
2335            end_line: symbol.range.end_line,
2336            exported: symbol.exported,
2337            embed_text,
2338            snippet,
2339        });
2340
2341        // Note: Nested symbols are handled separately by the outline system
2342        // Each symbol is indexed individually
2343    }
2344
2345    chunks
2346}
2347
/// Cosine similarity of `a` and `b`.
///
/// Returns 0.0 when the slices differ in length or when either has zero
/// magnitude (which includes two empty slices).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // One pass accumulating the dot product and both squared norms, in
    // element order.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
2371
2372// Serialization helpers
2373fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2374    match kind {
2375        SymbolKind::Function => 0,
2376        SymbolKind::Class => 1,
2377        SymbolKind::Method => 2,
2378        SymbolKind::Struct => 3,
2379        SymbolKind::Interface => 4,
2380        SymbolKind::Enum => 5,
2381        SymbolKind::TypeAlias => 6,
2382        SymbolKind::Variable => 7,
2383        SymbolKind::Heading => 8,
2384        SymbolKind::FileSummary => 9,
2385    }
2386}
2387
2388fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2389    match v {
2390        0 => SymbolKind::Function,
2391        1 => SymbolKind::Class,
2392        2 => SymbolKind::Method,
2393        3 => SymbolKind::Struct,
2394        4 => SymbolKind::Interface,
2395        5 => SymbolKind::Enum,
2396        6 => SymbolKind::TypeAlias,
2397        7 => SymbolKind::Variable,
2398        8 => SymbolKind::Heading,
2399        9 => SymbolKind::FileSummary,
2400        _ => SymbolKind::Heading,
2401    }
2402}
2403
/// Reads a little-endian `u32` from `data` at `*pos` and advances `*pos` by 4.
///
/// # Errors
///
/// Returns an error, leaving `*pos` untouched, when fewer than four bytes
/// remain at `*pos` (including when `*pos` is already past the end of `data`).
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    // `get` plus a slice pattern performs the bounds check without computing
    // `*pos + 4`, which could overflow for an out-of-range cursor.
    match data.get(*pos..) {
        Some(&[b0, b1, b2, b3, ..]) => {
            *pos += 4;
            Ok(u32::from_le_bytes([b0, b1, b2, b3]))
        }
        _ => Err("unexpected end of data reading u32".to_string()),
    }
}
2412
/// Reads a little-endian `u64` from `data` at `*pos` and advances `*pos` by 8.
///
/// # Errors
///
/// Returns an error, leaving `*pos` untouched, when fewer than eight bytes
/// remain at `*pos` (including when `*pos` is already past the end of `data`).
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    let start = *pos;
    // `checked_add` + `get` keep an out-of-range cursor from overflowing or
    // panicking; the conversion to a fixed-size array cannot fail afterwards
    // but is folded into the same error path instead of being unwrapped.
    let bytes: [u8; 8] = start
        .checked_add(8)
        .and_then(|end| data.get(start..end))
        .and_then(|slice| slice.try_into().ok())
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;
    *pos = start + 8;
    Ok(u64::from_le_bytes(bytes))
}
2421
2422fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
2423    let len = read_u32(data, pos)? as usize;
2424    if *pos + len > data.len() {
2425        return Err("unexpected end of data reading string".to_string());
2426    }
2427    let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
2428    *pos += len;
2429    Ok(s)
2430}
2431
2432#[cfg(test)]
2433mod tests {
2434    use super::*;
2435    use crate::config::{SemanticBackend, SemanticBackendConfig};
2436    use crate::parser::FileParser;
2437    use std::io::{Read, Write};
2438    use std::net::TcpListener;
2439    use std::thread;
2440
2441    fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
2442    where
2443        F: Fn(String, String, String) -> String + Send + 'static,
2444    {
2445        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
2446        let addr = listener.local_addr().expect("local addr");
2447        let handle = thread::spawn(move || {
2448            let (mut stream, _) = listener.accept().expect("accept request");
2449            let mut buf = Vec::new();
2450            let mut chunk = [0u8; 4096];
2451            let mut header_end = None;
2452            let mut content_length = 0usize;
2453            loop {
2454                let n = stream.read(&mut chunk).expect("read request");
2455                if n == 0 {
2456                    break;
2457                }
2458                buf.extend_from_slice(&chunk[..n]);
2459                if header_end.is_none() {
2460                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
2461                        header_end = Some(pos + 4);
2462                        let headers = String::from_utf8_lossy(&buf[..pos + 4]);
2463                        for line in headers.lines() {
2464                            if let Some(value) = line.strip_prefix("Content-Length:") {
2465                                content_length = value.trim().parse::<usize>().unwrap_or(0);
2466                            }
2467                        }
2468                    }
2469                }
2470                if let Some(end) = header_end {
2471                    if buf.len() >= end + content_length {
2472                        break;
2473                    }
2474                }
2475            }
2476
2477            let end = header_end.expect("header terminator");
2478            let request = String::from_utf8_lossy(&buf[..end]).to_string();
2479            let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
2480            let mut lines = request.lines();
2481            let request_line = lines.next().expect("request line").to_string();
2482            let path = request_line
2483                .split_whitespace()
2484                .nth(1)
2485                .expect("request path")
2486                .to_string();
2487            let response_body = handler(request_line, path, body);
2488            let response = format!(
2489                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
2490                response_body.len(),
2491                response_body
2492            );
2493            stream
2494                .write_all(response.as_bytes())
2495                .expect("write response");
2496        });
2497
2498        (format!("http://{}", addr), handle)
2499    }
2500
2501    fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
2502        Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
2503    }
2504
2505    fn write_rust_file(path: &Path, function_name: &str) {
2506        fs::write(
2507            path,
2508            format!("pub fn {function_name}() -> bool {{\n    true\n}}\n"),
2509        )
2510        .unwrap();
2511    }
2512
2513    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
2514        let mut embed = test_vector_for_texts;
2515        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
2516    }
2517
2518    fn test_project_root() -> PathBuf {
2519        std::env::current_dir().unwrap()
2520    }
2521
2522    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
2523        index.file_mtimes.insert(file.to_path_buf(), mtime);
2524        index.file_sizes.insert(file.to_path_buf(), size);
2525        index
2526            .file_hashes
2527            .insert(file.to_path_buf(), cache_freshness::zero_hash());
2528    }
2529
2530    #[test]
2531    fn semantic_cache_serialization_skips_paths_outside_project_root() {
2532        let dir = tempfile::tempdir().expect("create temp dir");
2533        let project = fs::canonicalize(dir.path()).expect("canonical project");
2534        let outside = project.join("..").join("outside.rs");
2535        let mut index = SemanticIndex::new(project.clone(), 3);
2536        index
2537            .file_mtimes
2538            .insert(outside.clone(), SystemTime::UNIX_EPOCH);
2539        index.file_sizes.insert(outside.clone(), 1);
2540        index
2541            .file_hashes
2542            .insert(outside.clone(), cache_freshness::zero_hash());
2543        index.entries.push(EmbeddingEntry {
2544            chunk: SemanticChunk {
2545                file: outside,
2546                name: "outside".to_string(),
2547                kind: SymbolKind::Function,
2548                start_line: 0,
2549                end_line: 0,
2550                exported: false,
2551                embed_text: "outside".to_string(),
2552                snippet: "outside".to_string(),
2553            },
2554            vector: vec![1.0, 0.0, 0.0],
2555        });
2556
2557        let bytes = index.to_bytes();
2558        let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
2559        assert_eq!(loaded.entries.len(), 0);
2560        assert!(loaded.file_mtimes.is_empty());
2561    }
2562
2563    #[test]
2564    fn test_cosine_similarity_identical() {
2565        let a = vec![1.0, 0.0, 0.0];
2566        let b = vec![1.0, 0.0, 0.0];
2567        assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
2568    }
2569
2570    #[test]
2571    fn test_cosine_similarity_orthogonal() {
2572        let a = vec![1.0, 0.0, 0.0];
2573        let b = vec![0.0, 1.0, 0.0];
2574        assert!(cosine_similarity(&a, &b).abs() < 0.001);
2575    }
2576
2577    #[test]
2578    fn test_cosine_similarity_opposite() {
2579        let a = vec![1.0, 0.0, 0.0];
2580        let b = vec![-1.0, 0.0, 0.0];
2581        assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
2582    }
2583
2584    #[test]
2585    fn test_serialization_roundtrip() {
2586        let project_root = test_project_root();
2587        let file = project_root.join("src/main.rs");
2588        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
2589        index.entries.push(EmbeddingEntry {
2590            chunk: SemanticChunk {
2591                file: file.clone(),
2592                name: "handle_request".to_string(),
2593                kind: SymbolKind::Function,
2594                start_line: 10,
2595                end_line: 25,
2596                exported: true,
2597                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
2598                snippet: "fn handle_request() {\n  // ...\n}".to_string(),
2599            },
2600            vector: vec![0.1, 0.2, 0.3, 0.4],
2601        });
2602        index.dimension = 4;
2603        index
2604            .file_mtimes
2605            .insert(file.clone(), SystemTime::UNIX_EPOCH);
2606        index.file_sizes.insert(file, 0);
2607        index.set_fingerprint(SemanticIndexFingerprint {
2608            backend: "fastembed".to_string(),
2609            model: "all-MiniLM-L6-v2".to_string(),
2610            base_url: FALLBACK_BACKEND.to_string(),
2611            dimension: 4,
2612            chunking_version: default_chunking_version(),
2613        });
2614
2615        let bytes = index.to_bytes();
2616        let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
2617
2618        assert_eq!(restored.entries.len(), 1);
2619        assert_eq!(restored.entries[0].chunk.name, "handle_request");
2620        assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
2621        assert_eq!(restored.dimension, 4);
2622        assert_eq!(restored.backend_label(), Some("fastembed"));
2623        assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
2624    }
2625
2626    #[test]
2627    fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
2628        let cases = [
2629            (SymbolKind::Function, 0),
2630            (SymbolKind::Class, 1),
2631            (SymbolKind::Method, 2),
2632            (SymbolKind::Struct, 3),
2633            (SymbolKind::Interface, 4),
2634            (SymbolKind::Enum, 5),
2635            (SymbolKind::TypeAlias, 6),
2636            (SymbolKind::Variable, 7),
2637            (SymbolKind::Heading, 8),
2638            (SymbolKind::FileSummary, 9),
2639        ];
2640
2641        for (kind, encoded) in cases {
2642            assert_eq!(symbol_kind_to_u8(&kind), encoded);
2643            assert_eq!(u8_to_symbol_kind(encoded), kind);
2644        }
2645    }
2646
2647    #[test]
2648    fn test_search_top_k() {
2649        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2650        index.dimension = 3;
2651
2652        // Add entries with known vectors
2653        for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
2654            let mut vec = vec![0.0f32; 3];
2655            vec[i] = 1.0; // orthogonal vectors
2656            index.entries.push(EmbeddingEntry {
2657                chunk: SemanticChunk {
2658                    file: PathBuf::from("/src/lib.rs"),
2659                    name: name.to_string(),
2660                    kind: SymbolKind::Function,
2661                    start_line: (i * 10 + 1) as u32,
2662                    end_line: (i * 10 + 5) as u32,
2663                    exported: true,
2664                    embed_text: format!("kind:function name:{}", name),
2665                    snippet: format!("fn {}() {{}}", name),
2666                },
2667                vector: vec,
2668            });
2669        }
2670
2671        // Query aligned with "auth" (index 0)
2672        let query = vec![0.9, 0.1, 0.0];
2673        let results = index.search(&query, 2);
2674
2675        assert_eq!(results.len(), 2);
2676        assert_eq!(results[0].name, "auth"); // highest score
2677        assert!(results[0].score > results[1].score);
2678    }
2679
2680    #[test]
2681    fn test_empty_index_search() {
2682        let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2683        let results = index.search(&[0.1, 0.2, 0.3], 10);
2684        assert!(results.is_empty());
2685    }
2686
2687    #[test]
2688    fn single_line_symbol_builds_non_empty_snippet() {
2689        let symbol = Symbol {
2690            name: "answer".to_string(),
2691            kind: SymbolKind::Variable,
2692            range: crate::symbols::Range {
2693                start_line: 0,
2694                start_col: 0,
2695                end_line: 0,
2696                end_col: 24,
2697            },
2698            signature: Some("const answer = 42".to_string()),
2699            scope_chain: Vec::new(),
2700            exported: true,
2701            parent: None,
2702        };
2703        let source = "export const answer = 42;\n";
2704
2705        let snippet = build_snippet(&symbol, source);
2706
2707        assert_eq!(snippet, "export const answer = 42;");
2708    }
2709
2710    #[test]
2711    fn optimized_file_chunk_collection_matches_file_parser_path() {
2712        let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
2713        let file = project_root.join("src/semantic_index.rs");
2714        let source = std::fs::read_to_string(&file).unwrap();
2715
2716        let mut legacy_parser = FileParser::new();
2717        let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
2718        let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
2719
2720        let mut parsers = HashMap::new();
2721        let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
2722
2723        assert_eq!(
2724            chunk_fingerprint(&optimized_chunks),
2725            chunk_fingerprint(&legacy_chunks)
2726        );
2727    }
2728
2729    fn chunk_fingerprint(
2730        chunks: &[SemanticChunk],
2731    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
2732        chunks
2733            .iter()
2734            .map(|chunk| {
2735                (
2736                    chunk.name.clone(),
2737                    chunk.kind.clone(),
2738                    chunk.start_line,
2739                    chunk.end_line,
2740                    chunk.exported,
2741                    chunk.embed_text.clone(),
2742                    chunk.snippet.clone(),
2743                )
2744            })
2745            .collect()
2746    }
2747
2748    #[test]
2749    fn rejects_oversized_dimension_during_deserialization() {
2750        let mut bytes = Vec::new();
2751        bytes.push(1u8);
2752        bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
2753        bytes.extend_from_slice(&0u32.to_le_bytes());
2754        bytes.extend_from_slice(&0u32.to_le_bytes());
2755
2756        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2757    }
2758
2759    #[test]
2760    fn rejects_oversized_entry_count_during_deserialization() {
2761        let mut bytes = Vec::new();
2762        bytes.push(1u8);
2763        bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
2764        bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
2765        bytes.extend_from_slice(&0u32.to_le_bytes());
2766
2767        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2768    }
2769
2770    #[test]
2771    fn invalidate_file_removes_entries_and_mtime() {
2772        let target = PathBuf::from("/src/main.rs");
2773        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2774        index.entries.push(EmbeddingEntry {
2775            chunk: SemanticChunk {
2776                file: target.clone(),
2777                name: "main".to_string(),
2778                kind: SymbolKind::Function,
2779                start_line: 0,
2780                end_line: 1,
2781                exported: false,
2782                embed_text: "main".to_string(),
2783                snippet: "fn main() {}".to_string(),
2784            },
2785            vector: vec![1.0; DEFAULT_DIMENSION],
2786        });
2787        index
2788            .file_mtimes
2789            .insert(target.clone(), SystemTime::UNIX_EPOCH);
2790        index.file_sizes.insert(target.clone(), 0);
2791
2792        index.invalidate_file(&target);
2793
2794        assert!(index.entries.is_empty());
2795        assert!(!index.file_mtimes.contains_key(&target));
2796        assert!(!index.file_sizes.contains_key(&target));
2797    }
2798
2799    #[test]
2800    fn refresh_transient_error_preserves_existing_entry_and_mtime() {
2801        let temp = tempfile::tempdir().unwrap();
2802        let project_root = temp.path();
2803        let file = project_root.join("src/lib.rs");
2804        fs::create_dir_all(file.parent().unwrap()).unwrap();
2805        write_rust_file(&file, "kept_symbol");
2806
2807        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2808        let original_entry_count = index.entries.len();
2809        let original_mtime = *index.file_mtimes.get(&file).unwrap();
2810        let original_size = *index.file_sizes.get(&file).unwrap();
2811
2812        let stale_mtime = SystemTime::UNIX_EPOCH;
2813        set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
2814        fs::remove_file(&file).unwrap();
2815
2816        let mut embed = test_vector_for_texts;
2817        let mut progress = |_done: usize, _total: usize| {};
2818        let summary = index
2819            .refresh_stale_files(
2820                project_root,
2821                std::slice::from_ref(&file),
2822                &mut embed,
2823                8,
2824                &mut progress,
2825            )
2826            .unwrap();
2827
2828        assert_eq!(summary.changed, 0);
2829        assert_eq!(summary.added, 0);
2830        assert_eq!(summary.deleted, 0);
2831        assert_eq!(index.entries.len(), original_entry_count);
2832        assert!(index
2833            .entries
2834            .iter()
2835            .any(|entry| entry.chunk.name == "kept_symbol"));
2836        assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
2837        assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
2838        assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
2839    }
2840
2841    #[test]
2842    fn refresh_never_indexed_file_error_does_not_record_mtime() {
2843        let temp = tempfile::tempdir().unwrap();
2844        let project_root = temp.path();
2845        let missing = project_root.join("src/missing.rs");
2846        fs::create_dir_all(missing.parent().unwrap()).unwrap();
2847
2848        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2849        let mut embed = test_vector_for_texts;
2850        let mut progress = |_done: usize, _total: usize| {};
2851        let summary = index
2852            .refresh_stale_files(
2853                project_root,
2854                std::slice::from_ref(&missing),
2855                &mut embed,
2856                8,
2857                &mut progress,
2858            )
2859            .unwrap();
2860
2861        assert_eq!(summary.added, 0);
2862        assert_eq!(summary.changed, 0);
2863        assert_eq!(summary.deleted, 0);
2864        assert!(!index.file_mtimes.contains_key(&missing));
2865        assert!(!index.file_sizes.contains_key(&missing));
2866        assert!(index.entries.is_empty());
2867    }
2868
2869    #[test]
2870    fn refresh_reports_added_for_new_files() {
2871        let temp = tempfile::tempdir().unwrap();
2872        let project_root = temp.path();
2873        let existing = project_root.join("src/lib.rs");
2874        let added = project_root.join("src/new.rs");
2875        fs::create_dir_all(existing.parent().unwrap()).unwrap();
2876        write_rust_file(&existing, "existing_symbol");
2877        write_rust_file(&added, "added_symbol");
2878
2879        let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
2880        let mut embed = test_vector_for_texts;
2881        let mut progress = |_done: usize, _total: usize| {};
2882        let summary = index
2883            .refresh_stale_files(
2884                project_root,
2885                &[existing.clone(), added.clone()],
2886                &mut embed,
2887                8,
2888                &mut progress,
2889            )
2890            .unwrap();
2891
2892        assert_eq!(summary.added, 1);
2893        assert_eq!(summary.changed, 0);
2894        assert_eq!(summary.deleted, 0);
2895        assert_eq!(summary.total_processed, 2);
2896        assert!(index.file_mtimes.contains_key(&added));
2897        assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
2898    }
2899
2900    #[test]
2901    fn refresh_reports_deleted_for_removed_files() {
2902        let temp = tempfile::tempdir().unwrap();
2903        let project_root = temp.path();
2904        let deleted = project_root.join("src/deleted.rs");
2905        fs::create_dir_all(deleted.parent().unwrap()).unwrap();
2906        write_rust_file(&deleted, "deleted_symbol");
2907
2908        let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
2909        fs::remove_file(&deleted).unwrap();
2910
2911        let mut embed = test_vector_for_texts;
2912        let mut progress = |_done: usize, _total: usize| {};
2913        let summary = index
2914            .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
2915            .unwrap();
2916
2917        assert_eq!(summary.deleted, 1);
2918        assert_eq!(summary.changed, 0);
2919        assert_eq!(summary.added, 0);
2920        assert_eq!(summary.total_processed, 1);
2921        assert!(!index.file_mtimes.contains_key(&deleted));
2922        assert!(index.entries.is_empty());
2923    }
2924
2925    #[test]
2926    fn refresh_reports_changed_for_modified_files() {
2927        let temp = tempfile::tempdir().unwrap();
2928        let project_root = temp.path();
2929        let file = project_root.join("src/lib.rs");
2930        fs::create_dir_all(file.parent().unwrap()).unwrap();
2931        write_rust_file(&file, "old_symbol");
2932
2933        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2934        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
2935        write_rust_file(&file, "new_symbol");
2936
2937        let mut embed = test_vector_for_texts;
2938        let mut progress = |_done: usize, _total: usize| {};
2939        let summary = index
2940            .refresh_stale_files(
2941                project_root,
2942                std::slice::from_ref(&file),
2943                &mut embed,
2944                8,
2945                &mut progress,
2946            )
2947            .unwrap();
2948
2949        assert_eq!(summary.changed, 1);
2950        assert_eq!(summary.added, 0);
2951        assert_eq!(summary.deleted, 0);
2952        assert_eq!(summary.total_processed, 1);
2953        assert!(index
2954            .entries
2955            .iter()
2956            .any(|entry| entry.chunk.name == "new_symbol"));
2957        assert!(!index
2958            .entries
2959            .iter()
2960            .any(|entry| entry.chunk.name == "old_symbol"));
2961    }
2962
2963    #[test]
2964    fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
2965        let temp = tempfile::tempdir().unwrap();
2966        let project_root = temp.path();
2967        let file = project_root.join("src/lib.rs");
2968        fs::create_dir_all(file.parent().unwrap()).unwrap();
2969        write_rust_file(&file, "clean_symbol");
2970
2971        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2972        let original_entries = index.entries.len();
2973        let mut embed_called = false;
2974        let mut embed = |texts: Vec<String>| {
2975            embed_called = true;
2976            test_vector_for_texts(texts)
2977        };
2978        let mut progress = |_done: usize, _total: usize| {};
2979        let summary = index
2980            .refresh_stale_files(
2981                project_root,
2982                std::slice::from_ref(&file),
2983                &mut embed,
2984                8,
2985                &mut progress,
2986            )
2987            .unwrap();
2988
2989        assert!(summary.is_noop());
2990        assert_eq!(summary.total_processed, 1);
2991        assert!(!embed_called);
2992        assert_eq!(index.entries.len(), original_entries);
2993    }
2994
2995    #[test]
2996    fn detects_missing_onnx_runtime_from_dynamic_load_error() {
2997        let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
2998
2999        assert!(is_onnx_runtime_unavailable(message));
3000    }
3001
3002    #[test]
3003    fn formats_missing_onnx_runtime_with_install_hint() {
3004        let message = format_embedding_init_error(
3005            "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
3006        );
3007
3008        assert!(message.starts_with("ONNX Runtime not found. Install via:"));
3009        assert!(message.contains("Original error:"));
3010    }
3011
3012    #[test]
3013    fn openai_compatible_backend_embeds_with_mock_server() {
3014        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3015            assert!(request_line.starts_with("POST "));
3016            assert_eq!(path, "/v1/embeddings");
3017            "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
3018        });
3019
3020        let config = SemanticBackendConfig {
3021            backend: SemanticBackend::OpenAiCompatible,
3022            model: "test-embedding".to_string(),
3023            base_url: Some(base_url),
3024            api_key_env: None,
3025            timeout_ms: 5_000,
3026            max_batch_size: 64,
3027        };
3028
3029        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3030        let vectors = model
3031            .embed(vec!["hello".to_string(), "world".to_string()])
3032            .unwrap();
3033
3034        assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
3035        handle.join().unwrap();
3036    }
3037
3038    /// Regression for issue #36: AFT was sending TWO Content-Type headers
3039    /// on the OpenAI embeddings request — once implicitly via `.json(&body)`
3040    /// and again explicitly via `.header("Content-Type", "application/json")`.
3041    /// reqwest's `.header()` calls `HeaderMap::append`, which produces two
3042    /// headers on the wire. OpenAI's /v1/embeddings endpoint rejects that
3043    /// with `HTTP 400 "you must provide a model parameter"` even though the
3044    /// body actually contains `model`. The fix is to drop the explicit
3045    /// `.header("Content-Type", ...)` call. This test pins that we send
3046    /// exactly one Content-Type header.
3047    #[test]
3048    fn openai_compatible_request_has_single_content_type_header() {
3049        use std::sync::{Arc, Mutex};
3050        let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
3051        let captured_for_thread = Arc::clone(&captured);
3052
3053        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3054        let addr = listener.local_addr().expect("local addr");
3055        let handle = thread::spawn(move || {
3056            let (mut stream, _) = listener.accept().expect("accept");
3057            let mut buf = Vec::new();
3058            let mut chunk = [0u8; 4096];
3059            let mut header_end = None;
3060            let mut content_length = 0usize;
3061            loop {
3062                let n = stream.read(&mut chunk).expect("read");
3063                if n == 0 {
3064                    break;
3065                }
3066                buf.extend_from_slice(&chunk[..n]);
3067                if header_end.is_none() {
3068                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3069                        header_end = Some(pos + 4);
3070                        for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
3071                            if let Some(value) = line.strip_prefix("Content-Length:") {
3072                                content_length = value.trim().parse::<usize>().unwrap_or(0);
3073                            }
3074                        }
3075                    }
3076                }
3077                if let Some(end) = header_end {
3078                    if buf.len() >= end + content_length {
3079                        break;
3080                    }
3081                }
3082            }
3083            *captured_for_thread.lock().unwrap() = buf;
3084            let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
3085            let response = format!(
3086                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3087                body.len(),
3088                body
3089            );
3090            let _ = stream.write_all(response.as_bytes());
3091        });
3092
3093        let config = SemanticBackendConfig {
3094            backend: SemanticBackend::OpenAiCompatible,
3095            model: "text-embedding-3-small".to_string(),
3096            base_url: Some(format!("http://{}", addr)),
3097            api_key_env: None,
3098            timeout_ms: 5_000,
3099            max_batch_size: 64,
3100        };
3101        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3102        let _ = model.embed(vec!["probe".to_string()]).unwrap();
3103        handle.join().unwrap();
3104
3105        let bytes = captured.lock().unwrap().clone();
3106        let request = String::from_utf8_lossy(&bytes);
3107
3108        // Lowercase line counts because HTTP headers are case-insensitive
3109        // and reqwest may emit `content-type` in lowercase under HTTP/2.
3110        let content_type_lines = request
3111            .lines()
3112            .filter(|line| {
3113                let lower = line.to_ascii_lowercase();
3114                lower.starts_with("content-type:")
3115            })
3116            .count();
3117        assert_eq!(
3118            content_type_lines, 1,
3119            "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
3120        );
3121
3122        // The body must still include the model field — pin this so a future
3123        // change can't accidentally drop `model` while fixing duplicate headers.
3124        assert!(
3125            request.contains(r#""model":"text-embedding-3-small""#),
3126            "request body should contain model field; full request:\n{request}",
3127        );
3128    }
3129
3130    #[test]
3131    fn ollama_backend_embeds_with_mock_server() {
3132        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3133            assert!(request_line.starts_with("POST "));
3134            assert_eq!(path, "/api/embed");
3135            "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
3136        });
3137
3138        let config = SemanticBackendConfig {
3139            backend: SemanticBackend::Ollama,
3140            model: "embeddinggemma".to_string(),
3141            base_url: Some(base_url),
3142            api_key_env: None,
3143            timeout_ms: 5_000,
3144            max_batch_size: 64,
3145        };
3146
3147        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3148        let vectors = model
3149            .embed(vec!["hello".to_string(), "world".to_string()])
3150            .unwrap();
3151
3152        assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
3153        handle.join().unwrap();
3154    }
3155
3156    #[test]
3157    fn read_from_disk_rejects_fingerprint_mismatch() {
3158        let storage = tempfile::tempdir().unwrap();
3159        let project_key = "proj";
3160
3161        let project_root = test_project_root();
3162        let file = project_root.join("src/main.rs");
3163        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
3164        index.entries.push(EmbeddingEntry {
3165            chunk: SemanticChunk {
3166                file: file.clone(),
3167                name: "handle_request".to_string(),
3168                kind: SymbolKind::Function,
3169                start_line: 10,
3170                end_line: 25,
3171                exported: true,
3172                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3173                snippet: "fn handle_request() {}".to_string(),
3174            },
3175            vector: vec![0.1, 0.2, 0.3],
3176        });
3177        index.dimension = 3;
3178        index
3179            .file_mtimes
3180            .insert(file.clone(), SystemTime::UNIX_EPOCH);
3181        index.file_sizes.insert(file, 0);
3182        index.set_fingerprint(SemanticIndexFingerprint {
3183            backend: "openai_compatible".to_string(),
3184            model: "test-embedding".to_string(),
3185            base_url: "http://127.0.0.1:1234/v1".to_string(),
3186            dimension: 3,
3187            chunking_version: default_chunking_version(),
3188        });
3189        index.write_to_disk(storage.path(), project_key);
3190
3191        let matching = index.fingerprint().unwrap().as_string();
3192        assert!(SemanticIndex::read_from_disk(
3193            storage.path(),
3194            project_key,
3195            &project_root,
3196            false,
3197            Some(&matching),
3198        )
3199        .is_some());
3200
3201        let mismatched = SemanticIndexFingerprint {
3202            backend: "ollama".to_string(),
3203            model: "embeddinggemma".to_string(),
3204            base_url: "http://127.0.0.1:11434".to_string(),
3205            dimension: 3,
3206            chunking_version: default_chunking_version(),
3207        }
3208        .as_string();
3209        assert!(SemanticIndex::read_from_disk(
3210            storage.path(),
3211            project_key,
3212            &project_root,
3213            false,
3214            Some(&mismatched),
3215        )
3216        .is_none());
3217    }
3218
3219    #[test]
3220    fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
3221        let storage = tempfile::tempdir().unwrap();
3222        let project_key = "proj-v3";
3223        let dir = storage.path().join("semantic").join(project_key);
3224        fs::create_dir_all(&dir).unwrap();
3225
3226        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3227        index.entries.push(EmbeddingEntry {
3228            chunk: SemanticChunk {
3229                file: PathBuf::from("/src/main.rs"),
3230                name: "handle_request".to_string(),
3231                kind: SymbolKind::Function,
3232                start_line: 0,
3233                end_line: 0,
3234                exported: true,
3235                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3236                snippet: "fn handle_request() {}".to_string(),
3237            },
3238            vector: vec![0.1, 0.2, 0.3],
3239        });
3240        index.dimension = 3;
3241        index
3242            .file_mtimes
3243            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
3244        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
3245        let fingerprint = SemanticIndexFingerprint {
3246            backend: "fastembed".to_string(),
3247            model: "test".to_string(),
3248            base_url: FALLBACK_BACKEND.to_string(),
3249            dimension: 3,
3250            chunking_version: default_chunking_version(),
3251        };
3252        index.set_fingerprint(fingerprint.clone());
3253
3254        let mut bytes = index.to_bytes();
3255        bytes[0] = SEMANTIC_INDEX_VERSION_V3;
3256        fs::write(dir.join("semantic.bin"), bytes).unwrap();
3257
3258        assert!(SemanticIndex::read_from_disk(
3259            storage.path(),
3260            project_key,
3261            &test_project_root(),
3262            false,
3263            Some(&fingerprint.as_string())
3264        )
3265        .is_none());
3266        assert!(!dir.join("semantic.bin").exists());
3267    }
3268
3269    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3270        crate::symbols::Symbol {
3271            name: name.to_string(),
3272            kind,
3273            range: crate::symbols::Range {
3274                start_line: start,
3275                start_col: 0,
3276                end_line: end,
3277                end_col: 0,
3278            },
3279            signature: None,
3280            scope_chain: Vec::new(),
3281            exported: false,
3282            parent: None,
3283        }
3284    }
3285
3286    /// Heading symbols (Markdown / HTML headings) must NOT be indexed —
3287    /// they overwhelmingly dominated semantic results even on code-shaped
3288    /// queries because heading prose embeds far more strongly than code
3289    /// chunks. Skipping headings keeps aft_search a code-finder.
3290    #[test]
3291    fn symbols_to_chunks_skips_heading_symbols() {
3292        let project_root = PathBuf::from("/proj");
3293        let file = project_root.join("README.md");
3294        let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
3295
3296        let symbols = vec![
3297            make_symbol(SymbolKind::Heading, "Title", 0, 2),
3298            make_symbol(SymbolKind::Heading, "Section", 4, 6),
3299        ];
3300
3301        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3302        assert!(
3303            chunks.is_empty(),
3304            "Heading symbols must be filtered out before embedding; got {} chunk(s)",
3305            chunks.len()
3306        );
3307    }
3308
3309    /// Code symbols (functions, classes, methods, structs, etc.) must still
3310    /// be indexed alongside the heading skip — otherwise we'd starve the
3311    /// index entirely.
3312    #[test]
3313    fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
3314        let project_root = PathBuf::from("/proj");
3315        let file = project_root.join("src/lib.rs");
3316        let source = "pub fn handle_request() -> bool {\n    true\n}\n";
3317
3318        let symbols = vec![
3319            // A heading mixed in (e.g. from a doc comment block elsewhere).
3320            make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
3321            make_symbol(SymbolKind::Function, "handle_request", 0, 2),
3322            make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
3323        ];
3324
3325        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3326        assert_eq!(
3327            chunks.len(),
3328            3,
3329            "Expected file-summary + 2 code chunks (Function + Struct), got {}",
3330            chunks.len()
3331        );
3332        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
3333        assert!(chunks
3334            .iter()
3335            .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
3336        assert!(names.contains(&"handle_request"));
3337        assert!(names.contains(&"AuthService"));
3338        assert!(
3339            !names.contains(&"doc heading"),
3340            "Heading symbol leaked into chunks: {names:?}"
3341        );
3342    }
3343
3344    #[test]
3345    fn validate_ssrf_allows_loopback_hostnames() {
3346        // Loopback hostnames are explicitly allowed so self-hosted backends
3347        // (Ollama at http://localhost:11434) work at their default config.
3348        for host in &[
3349            "http://localhost",
3350            "http://localhost:8080",
3351            "http://localhost:11434", // Ollama default
3352            "http://localhost.localdomain",
3353            "http://foo.localhost",
3354        ] {
3355            assert!(
3356                validate_base_url_no_ssrf(host).is_ok(),
3357                "Expected {host} to be allowed (loopback), got: {:?}",
3358                validate_base_url_no_ssrf(host)
3359            );
3360        }
3361    }
3362
3363    #[test]
3364    fn validate_ssrf_allows_loopback_ips() {
3365        // 127.0.0.0/8 is loopback — by definition same-machine and not an
3366        // SSRF target. Allow it so Ollama at http://127.0.0.1:11434 works.
3367        for url in &[
3368            "http://127.0.0.1",
3369            "http://127.0.0.1:11434", // Ollama default
3370            "http://127.0.0.1:8080",
3371            "http://127.1.2.3",
3372        ] {
3373            let result = validate_base_url_no_ssrf(url);
3374            assert!(
3375                result.is_ok(),
3376                "Expected {url} to be allowed (loopback), got: {:?}",
3377                result
3378            );
3379        }
3380    }
3381
3382    #[test]
3383    fn validate_ssrf_rejects_private_non_loopback_ips() {
3384        // Non-loopback private/reserved IPs remain rejected — homelab/intranet
3385        // services on LAN IPs are real SSRF targets even though the user
3386        // configured them. Users who want this can opt in by binding the
3387        // service to a public-routable address.
3388        for url in &[
3389            "http://192.168.1.1",
3390            "http://10.0.0.1",
3391            "http://172.16.0.1",
3392            "http://169.254.169.254",
3393            "http://100.64.0.1",
3394        ] {
3395            let result = validate_base_url_no_ssrf(url);
3396            assert!(
3397                result.is_err(),
3398                "Expected {url} to be rejected (non-loopback private), got: {:?}",
3399                result
3400            );
3401        }
3402    }
3403
3404    #[test]
3405    fn validate_ssrf_rejects_mdns_local_hostnames() {
3406        // mDNS .local hostnames typically resolve to LAN devices, not
3407        // loopback. Rejecting them before DNS lookup gives a clearer error.
3408        for host in &[
3409            "http://printer.local",
3410            "http://nas.local:8080",
3411            "http://homelab.local",
3412        ] {
3413            let result = validate_base_url_no_ssrf(host);
3414            assert!(
3415                result.is_err(),
3416                "Expected {host} to be rejected (mDNS), got: {:?}",
3417                result
3418            );
3419        }
3420    }
3421
3422    #[test]
3423    fn normalize_base_url_allows_localhost_for_tests() {
3424        // normalize_base_url itself should NOT block localhost — only
3425        // validate_base_url_no_ssrf does. Tests construct backends directly.
3426        assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
3427        assert!(normalize_base_url("http://localhost:8080").is_ok());
3428    }
3429
3430    /// Pin the user-facing wording of the ONNX version-mismatch error.
3431    /// The auto-fix path MUST be listed first because it's the only safe
3432    /// option that doesn't require sudo or risk breaking other apps that
3433    /// link the system library. Regression of any of these strings would
3434    /// either mislead users (system rm before auto-fix) or break the
3435    /// `aft doctor --fix` discovery path.
3436    #[test]
3437    fn ort_mismatch_message_recommends_auto_fix_first() {
3438        let msg =
3439            format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
3440
3441        // The reported version and path must appear verbatim.
3442        assert!(
3443            msg.contains("v1.9.0"),
3444            "should report detected version: {msg}"
3445        );
3446        assert!(
3447            msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
3448            "should report system path: {msg}"
3449        );
3450        assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
3451
3452        // Solution ordering: auto-fix is #1, system rm is #2, install is #3.
3453        let auto_fix_pos = msg
3454            .find("Auto-fix")
3455            .expect("Auto-fix solution missing — users won't discover --fix");
3456        let remove_pos = msg
3457            .find("Remove the old library")
3458            .expect("system-rm solution missing");
3459        assert!(
3460            auto_fix_pos < remove_pos,
3461            "Auto-fix must come before manual rm — see PR comment thread"
3462        );
3463
3464        // The auto-fix command must be runnable as-is on a fresh system.
3465        assert!(
3466            msg.contains("npx @cortexkit/aft doctor --fix"),
3467            "auto-fix command must be present and copy-pasteable: {msg}"
3468        );
3469    }
3470
3471    /// macOS dylib paths must not produce a malformed message when the
3472    /// system path lacks a trailing slash. This is a regression guard
3473    /// for the "{}\n{}" format string contract.
3474    #[test]
3475    fn ort_mismatch_message_handles_macos_dylib_path() {
3476        let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
3477        assert!(msg.contains("v1.9.0"));
3478        assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
3479        // The dylib path must appear in the auto-fix paragraph (single
3480        // quotes around it) AND in the manual-rm paragraph; verify
3481        // both placements survived the format string.
3482        assert!(
3483            msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
3484            "system path should be quoted in the auto-fix sentence: {msg}"
3485        );
3486    }
3487}