aft/
semantic_index.rs

1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use fastembed::{EmbeddingModel as FastembedEmbeddingModel, InitOptions, TextEmbedding};
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
// NOTE(review): presumably the fallback embedding width; its use is outside
// this part of the file — confirm.
const DEFAULT_DIMENSION: usize = 384;
// NOTE(review): presumably an upper bound on persisted index entries; its use
// is outside this part of the file — confirm.
const MAX_ENTRIES: usize = 1_000_000;
// Covers high-dimensional backends such as OpenAI text-embedding-3-large (3072)
// and common local models (4096) while keeping a bounded supported shape.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of one `f32` component.
const F32_BYTES: usize = std::mem::size_of::<f32>();
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS) or apt install libonnxruntime (Linux).";

const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
/// V3 adds subsec_nanos to the file-mtime table so staleness detection survives
/// restart round-trips on filesystems with subsecond mtime precision (APFS,
/// ext4 with nsec, NTFS). V1/V2 persisted whole-second mtimes only, which
/// caused every restart to flag ~99% of files as stale and re-embed them.
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
/// V4 keeps the V3 on-disk layout but rebuilds persisted snippets once after
/// fixing symbol ranges that were incorrectly treated as 1-based.
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
/// V5 adds file sizes to the file metadata table so incremental staleness
/// detection can catch content changes even when mtime precision misses them.
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
/// V6 stores paths relative to project_root and adds content hashes.
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended (after the `/v1` prefix) to build the OpenAI-compatible
/// embeddings endpoint.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended to the base URL to build the Ollama embeddings endpoint.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
// Must stay below the bridge timeout (30s) to avoid bridge kills on slow backends.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the configured `max_batch_size` is 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Number of cached query embeddings at which the oldest entry is evicted
/// before a new one is inserted.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint `base_url` when the config has no
/// base URL or it fails to normalize.
const FALLBACK_BACKEND: &str = "none";
/// Total number of HTTP attempts (first try included) per embedding request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay in milliseconds before retry N, indexed by the failed attempt.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Process-wide mutex held while acquiring the on-disk semantic cache lock.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
60
61pub struct SemanticIndexLock {
62    _guard: fs_lock::LockGuard,
63}
64
65impl SemanticIndexLock {
66    pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
67        let dir = storage_dir.join("semantic").join(project_key);
68        fs::create_dir_all(&dir)?;
69        let path = dir.join("cache.lock");
70        let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
71            .lock()
72            .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
73        fs_lock::try_acquire(&path, Duration::from_secs(2))
74            .map(|guard| Self { _guard: guard })
75            .map_err(|error| match error {
76                fs_lock::AcquireError::Timeout => {
77                    std::io::Error::other("timed out acquiring semantic cache lock")
78                }
79                fs_lock::AcquireError::Io(error) => error,
80            })
81    }
82}
83
84#[derive(Debug, Clone, Serialize, Deserialize)]
85pub struct SemanticIndexFingerprint {
86    pub backend: String,
87    pub model: String,
88    #[serde(default)]
89    pub base_url: String,
90    pub dimension: usize,
91    #[serde(default = "default_chunking_version")]
92    pub chunking_version: u32,
93}
94
95fn default_chunking_version() -> u32 {
96    2
97}
98
99impl SemanticIndexFingerprint {
100    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
101        // Use normalized URL for fingerprinting so cosmetic differences
102        // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds.
103        let base_url = config
104            .base_url
105            .as_ref()
106            .and_then(|u| normalize_base_url(u).ok())
107            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
108        Self {
109            backend: config.backend.as_str().to_string(),
110            model: config.model.clone(),
111            base_url,
112            dimension,
113            chunking_version: default_chunking_version(),
114        }
115    }
116
117    pub fn as_string(&self) -> String {
118        serde_json::to_string(self).unwrap_or_else(|_| String::new())
119    }
120
121    fn matches_expected(&self, expected: &str) -> bool {
122        let encoded = self.as_string();
123        !encoded.is_empty() && encoded == expected
124    }
125}
126
/// Backend-specific state needed to produce embeddings.
enum SemanticEmbeddingEngine {
    /// In-process embedding through fastembed.
    Fastembed(TextEmbedding),
    /// HTTP backend speaking the OpenAI embeddings protocol.
    OpenAiCompatible {
        /// Blocking HTTP client used for every request.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Normalized base URL the endpoint is derived from.
        base_url: String,
        /// Bearer token sent in the `Authorization` header, if any.
        api_key: Option<String>,
    },
    /// HTTP backend speaking the Ollama embed protocol.
    Ollama {
        /// Blocking HTTP client used for every request.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Normalized base URL the endpoint is derived from.
        base_url: String,
    },
}
141
/// Embedding model bound to one configured backend, with a small cache of
/// query embeddings.
pub struct SemanticEmbeddingModel {
    /// Backend selected in the configuration.
    backend: SemanticBackend,
    /// Model name as configured.
    model: String,
    /// Base URL exactly as configured (not normalized).
    base_url: Option<String>,
    /// Effective HTTP timeout in milliseconds.
    timeout_ms: u64,
    /// Effective maximum batch size.
    max_batch_size: usize,
    /// Embedding dimension once it has been observed; `None` until then.
    dimension: Option<usize>,
    /// Backend-specific state used to compute embeddings.
    engine: SemanticEmbeddingEngine,
    /// Cached query embeddings keyed by the query text.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of the cached queries; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    /// Number of query lookups served from the cache.
    query_embedding_cache_hits: u64,
    /// Number of query lookups that had to be embedded.
    query_embedding_cache_misses: u64,
}

/// Alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
157
158fn validate_embedding_batch(
159    vectors: &[Vec<f32>],
160    expected_count: usize,
161    context: &str,
162) -> Result<(), String> {
163    if expected_count > 0 && vectors.is_empty() {
164        return Err(format!(
165            "{context} returned no vectors for {expected_count} inputs"
166        ));
167    }
168
169    if vectors.len() != expected_count {
170        return Err(format!(
171            "{context} returned {} vectors for {} inputs",
172            vectors.len(),
173            expected_count
174        ));
175    }
176
177    let Some(first_vector) = vectors.first() else {
178        return Ok(());
179    };
180    let expected_dimension = first_vector.len();
181    validate_embedding_dimension(expected_dimension)
182        .map_err(|error| format!("{context} returned {error}"))?;
183    for (index, vector) in vectors.iter().enumerate() {
184        if vector.len() != expected_dimension {
185            return Err(format!(
186                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
187                vector.len()
188            ));
189        }
190    }
191
192    Ok(())
193}
194
195fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
196    if dimension == 0 || dimension > MAX_DIMENSION {
197        return Err(format!(
198            "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
199        ));
200    }
201
202    Ok(())
203}
204
205/// Normalize a base URL: validate scheme and strip trailing slash.
206/// Does NOT perform SSRF/private-IP validation — call
207/// `validate_base_url_no_ssrf` separately when processing user-supplied config.
208fn normalize_base_url(raw: &str) -> Result<String, String> {
209    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
210    let scheme = parsed.scheme();
211    if scheme != "http" && scheme != "https" {
212        return Err(format!(
213            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
214            scheme
215        ));
216    }
217    Ok(parsed.to_string().trim_end_matches('/').to_string())
218}
219
220/// Validate that a base URL does not point to a private/loopback address.
221/// Call this on user-supplied config (at configure time) to prevent SSRF.
222/// Not called for programmatically constructed configs (e.g. tests).
223///
224/// **Loopback is allowed.** Self-hosted embedding backends (e.g. Ollama at
225/// `http://127.0.0.1:11434`) are a primary use case for `aft_search`. Loopback
226/// addresses by definition cannot be exploited as SSRF targets — they only
227/// reach services on the same machine. Allowing loopback unblocks Ollama at its
228/// default config without opening up SSRF to LAN/intranet services, which
229/// remain rejected.
230///
231/// **mDNS `.local` is rejected.** mDNS hostnames typically resolve to LAN
232/// devices (printers, homelab servers); rejecting them before DNS lookup keeps
233/// the SSRF guard meaningful for non-loopback private networks.
234pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
235    use std::net::{IpAddr, ToSocketAddrs};
236
237    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
238
239    let host = parsed.host_str().unwrap_or("");
240
241    // Loopback hostnames are explicitly allowed. RFC 6761 mandates that
242    // `localhost` and `*.localhost` resolve to loopback;
243    // `localhost.localdomain` is a historical alias used on some Linux
244    // distros. Self-hosted backends like Ollama use these by default.
245    let is_loopback_host =
246        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
247    if is_loopback_host {
248        return Ok(());
249    }
250
251    // mDNS hostnames are typically LAN devices, not loopback. Reject before
252    // DNS lookup so users get a clear error rather than a private-IP error.
253    if host.ends_with(".local") {
254        return Err(format!(
255            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
256        ));
257    }
258
259    // Resolve the hostname. Reject private/link-local/CGNAT IPs but NOT
260    // loopback (which is by definition same-machine and not an SSRF target).
261    let port = parsed.port_or_known_default().unwrap_or(443);
262    let addr_str = format!("{host}:{port}");
263    let addrs: Vec<IpAddr> = addr_str
264        .to_socket_addrs()
265        .map(|iter| iter.map(|sa| sa.ip()).collect())
266        .unwrap_or_default();
267    for ip in &addrs {
268        if is_private_non_loopback_ip(ip) {
269            return Err(format!(
270                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
271            ));
272        }
273    }
274
275    Ok(())
276}
277
/// Returns true for IPv4/IPv6 addresses in private/link-local/CGNAT/wildcard
/// ranges, EXCLUDING loopback (127.0.0.0/8 and ::1). Loopback is considered
/// safe for SSRF purposes — see [`validate_base_url_no_ssrf`] for rationale.
///
/// IPv4-mapped IPv6 addresses (`::ffff:a.b.c.d`) are classified by their
/// embedded IPv4 address. The IPv6 unspecified address `::` is rejected, as
/// the IPv4 wildcard range `0.0.0.0/8` is.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => {
            let [first, second, ..] = v4.octets();
            // Note: 127.0.0.0/8 (loopback) is intentionally NOT in this set.
            // 10.0.0.0/8
            first == 10
                // 172.16.0.0/12
                || (first == 172 && (16..=31).contains(&second))
                // 192.168.0.0/16
                || (first == 192 && second == 168)
                // 169.254.0.0/16 link-local
                || (first == 169 && second == 254)
                // 100.64.0.0/10 CGNAT
                || (first == 100 && (64..=127).contains(&second))
                // 0.0.0.0/8 wildcard
                || first == 0
        }
        IpAddr::V6(v6) => {
            // ::ffff:0:0/96 IPv4-mapped — judge the embedded IPv4 address.
            if let Some(mapped) = v6.to_ipv4_mapped() {
                return is_private_non_loopback_ip(&IpAddr::V4(mapped));
            }

            // Note: ::1 (loopback) is intentionally NOT in this set.
            let head = v6.segments()[0];
            // :: wildcard
            v6.is_unspecified()
                // fe80::/10 link-local
                || (head & 0xffc0) == 0xfe80
                // fc00::/7 unique-local
                || (head & 0xfe00) == 0xfc00
        }
    }
}
319
320fn build_openai_embeddings_endpoint(base_url: &str) -> String {
321    if base_url.ends_with("/v1") {
322        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
323    } else {
324        format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
325    }
326}
327
328fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
329    if base_url.ends_with("/api") {
330        format!("{base_url}/embed")
331    } else {
332        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
333    }
334}
335
/// Trim surrounding whitespace from an optional token, turning a missing or
/// blank value into `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let raw = value?;
    let trimmed = raw.trim();
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
346
347fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
348    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
349}
350
351fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
352    error.is_connect()
353}
354
355fn sleep_before_embedding_retry(attempt_index: usize) {
356    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
357        std::thread::sleep(Duration::from_millis(*delay_ms));
358    }
359}
360
361fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
362where
363    F: FnMut() -> reqwest::blocking::RequestBuilder,
364{
365    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
366        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
367
368        let response = match make_request().send() {
369            Ok(response) => response,
370            Err(error) => {
371                if !last_attempt && is_retryable_embedding_error(&error) {
372                    sleep_before_embedding_retry(attempt_index);
373                    continue;
374                }
375                return Err(format!("{backend_label} request failed: {error}"));
376            }
377        };
378
379        let status = response.status();
380        let raw = match response.text() {
381            Ok(raw) => raw,
382            Err(error) => {
383                if !last_attempt && is_retryable_embedding_error(&error) {
384                    sleep_before_embedding_retry(attempt_index);
385                    continue;
386                }
387                return Err(format!("{backend_label} response read failed: {error}"));
388            }
389        };
390
391        if status.is_success() {
392            return Ok(raw);
393        }
394
395        if !last_attempt && is_retryable_embedding_status(status) {
396            sleep_before_embedding_retry(attempt_index);
397            continue;
398        }
399
400        return Err(format!(
401            "{backend_label} request failed (HTTP {}): {}",
402            status, raw
403        ));
404    }
405
406    unreachable!("embedding request retries exhausted without returning")
407}
408
409impl SemanticEmbeddingModel {
410    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
411        let timeout_ms = if config.timeout_ms == 0 {
412            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
413        } else {
414            config.timeout_ms
415        };
416
417        let max_batch_size = if config.max_batch_size == 0 {
418            DEFAULT_MAX_BATCH_SIZE
419        } else {
420            config.max_batch_size
421        };
422
423        let api_key_env = normalize_api_key(config.api_key_env.clone());
424        let model = config.model.clone();
425
426        let client = Client::builder()
427            .timeout(Duration::from_millis(timeout_ms))
428            .redirect(reqwest::redirect::Policy::none())
429            .build()
430            .map_err(|error| format!("failed to configure embedding client: {error}"))?;
431
432        let engine = match config.backend {
433            SemanticBackend::Fastembed => {
434                SemanticEmbeddingEngine::Fastembed(initialize_text_embedding(&model)?)
435            }
436            SemanticBackend::OpenAiCompatible => {
437                let raw = config.base_url.as_ref().ok_or_else(|| {
438                    "base_url is required for openai_compatible backend".to_string()
439                })?;
440                let base_url = normalize_base_url(raw)?;
441
442                let api_key = match api_key_env {
443                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
444                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
445                    })?),
446                    None => None,
447                };
448
449                SemanticEmbeddingEngine::OpenAiCompatible {
450                    client,
451                    model,
452                    base_url,
453                    api_key,
454                }
455            }
456            SemanticBackend::Ollama => {
457                let raw = config
458                    .base_url
459                    .as_ref()
460                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
461                let base_url = normalize_base_url(raw)?;
462
463                SemanticEmbeddingEngine::Ollama {
464                    client,
465                    model,
466                    base_url,
467                }
468            }
469        };
470
471        Ok(Self {
472            backend: config.backend,
473            model: config.model.clone(),
474            base_url: config.base_url.clone(),
475            timeout_ms,
476            max_batch_size,
477            dimension: None,
478            engine,
479            query_embedding_cache: HashMap::new(),
480            query_embedding_cache_order: VecDeque::new(),
481            query_embedding_cache_hits: 0,
482            query_embedding_cache_misses: 0,
483        })
484    }
485
    /// Backend this model was configured with.
    pub fn backend(&self) -> SemanticBackend {
        self.backend
    }

    /// Configured model name.
    pub fn model(&self) -> &str {
        &self.model
    }

    /// Base URL exactly as configured, if any.
    pub fn base_url(&self) -> Option<&str> {
        self.base_url.as_deref()
    }

    /// Effective maximum batch size.
    pub fn max_batch_size(&self) -> usize {
        self.max_batch_size
    }

    /// Effective HTTP timeout in milliseconds.
    pub fn timeout_ms(&self) -> u64 {
        self.timeout_ms
    }
505
506    pub fn fingerprint(
507        &mut self,
508        config: &SemanticBackendConfig,
509    ) -> Result<SemanticIndexFingerprint, String> {
510        let dimension = self.dimension()?;
511        Ok(SemanticIndexFingerprint::from_config(config, dimension))
512    }
513
514    pub fn dimension(&mut self) -> Result<usize, String> {
515        if let Some(dimension) = self.dimension {
516            return Ok(dimension);
517        }
518
519        let dimension = match &mut self.engine {
520            SemanticEmbeddingEngine::Fastembed(model) => {
521                let vectors = model
522                    .embed(vec!["semantic index fingerprint probe".to_string()], None)
523                    .map_err(|error| format_embedding_init_error(error.to_string()))?;
524                vectors
525                    .first()
526                    .map(|v| v.len())
527                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
528            }
529            SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
530                let vectors =
531                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
532                vectors
533                    .first()
534                    .map(|v| v.len())
535                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
536            }
537            SemanticEmbeddingEngine::Ollama { .. } => {
538                let vectors =
539                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
540                vectors
541                    .first()
542                    .map(|v| v.len())
543                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
544            }
545        };
546
547        self.dimension = Some(dimension);
548        Ok(dimension)
549    }
550
    /// Embed `texts` with the configured backend.
    ///
    /// Public entry point that forwards to `embed_texts`.
    ///
    /// # Errors
    /// Propagates the backend's error message.
    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        self.embed_texts(texts)
    }
554
555    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
556        if let Some(vector) = self.query_embedding_cache.get(query) {
557            self.query_embedding_cache_hits += 1;
558            return Ok(vector.clone());
559        }
560
561        self.query_embedding_cache_misses += 1;
562        let embeddings = self.embed_texts(vec![query.to_string()])?;
563        let vector = embeddings
564            .first()
565            .cloned()
566            .ok_or_else(|| "embedding model returned no query vector".to_string())?;
567
568        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
569            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
570                self.query_embedding_cache.remove(&oldest);
571            }
572        }
573        self.query_embedding_cache
574            .insert(query.to_string(), vector.clone());
575        self.query_embedding_cache_order
576            .push_back(query.to_string());
577
578        Ok(vector)
579    }
580
581    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
582        (
583            self.query_embedding_cache_hits,
584            self.query_embedding_cache_misses,
585            self.query_embedding_cache.len(),
586        )
587    }
588
589    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
590        match &mut self.engine {
591            SemanticEmbeddingEngine::Fastembed(model) => model
592                .embed(texts, None::<usize>)
593                .map_err(|error| format_embedding_init_error(error.to_string()))
594                .map_err(|error| format!("failed to embed batch: {error}")),
595            SemanticEmbeddingEngine::OpenAiCompatible {
596                client,
597                model,
598                base_url,
599                api_key,
600            } => {
601                let expected_text_count = texts.len();
602                let endpoint = build_openai_embeddings_endpoint(base_url);
603                let body = serde_json::json!({
604                    "input": texts,
605                    "model": model,
606                });
607
608                let raw = send_embedding_request(
609                    || {
610                        // `.json(&body)` sets Content-Type: application/json
611                        // automatically. Do NOT add `.header("Content-Type",
612                        // "application/json")` afterwards — RequestBuilder::header()
613                        // calls HeaderMap::append, which produces TWO Content-Type
614                        // headers on the wire. OpenAI's /v1/embeddings endpoint
615                        // treats duplicate Content-Type as malformed and rejects
616                        // the body with 400 "you must provide a model parameter"
617                        // even when `model` is set. Verified end-to-end against
618                        // api.openai.com. See issue #36.
619                        let mut request = client.post(&endpoint).json(&body);
620
621                        if let Some(api_key) = api_key {
622                            request = request.header("Authorization", format!("Bearer {api_key}"));
623                        }
624
625                        request
626                    },
627                    "openai compatible",
628                )?;
629
630                #[derive(Deserialize)]
631                struct OpenAiResponse {
632                    data: Vec<OpenAiEmbeddingResult>,
633                }
634
635                #[derive(Deserialize)]
636                struct OpenAiEmbeddingResult {
637                    embedding: Vec<f32>,
638                    index: Option<u32>,
639                }
640
641                let parsed: OpenAiResponse = serde_json::from_str(&raw)
642                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
643                if parsed.data.len() != expected_text_count {
644                    return Err(format!(
645                        "openai compatible response returned {} embeddings for {} inputs",
646                        parsed.data.len(),
647                        expected_text_count
648                    ));
649                }
650
651                let mut vectors = vec![Vec::new(); parsed.data.len()];
652                for (i, item) in parsed.data.into_iter().enumerate() {
653                    let index = item.index.unwrap_or(i as u32) as usize;
654                    if index >= vectors.len() {
655                        return Err(
656                            "openai compatible response contains invalid vector index".to_string()
657                        );
658                    }
659                    vectors[index] = item.embedding;
660                }
661
662                for vector in &vectors {
663                    if vector.is_empty() {
664                        return Err(
665                            "openai compatible response contained missing vectors".to_string()
666                        );
667                    }
668                }
669
670                self.dimension = vectors.first().map(Vec::len);
671                Ok(vectors)
672            }
673            SemanticEmbeddingEngine::Ollama {
674                client,
675                model,
676                base_url,
677            } => {
678                let expected_text_count = texts.len();
679                let endpoint = build_ollama_embeddings_endpoint(base_url);
680
681                #[derive(Serialize)]
682                struct OllamaPayload<'a> {
683                    model: &'a str,
684                    input: Vec<String>,
685                }
686
687                let payload = OllamaPayload {
688                    model,
689                    input: texts,
690                };
691
692                let raw = send_embedding_request(
693                    || {
694                        // `.json(&payload)` sets Content-Type automatically.
695                        // Same duplicate-header trap as the OpenAI branch above
696                        // — most Ollama servers tolerate it, but the
697                        // single-Content-Type form is the correct one.
698                        client.post(&endpoint).json(&payload)
699                    },
700                    "ollama",
701                )?;
702
703                #[derive(Deserialize)]
704                struct OllamaResponse {
705                    embeddings: Vec<Vec<f32>>,
706                }
707
708                let parsed: OllamaResponse = serde_json::from_str(&raw)
709                    .map_err(|error| format!("invalid ollama response: {error}"))?;
710                if parsed.embeddings.is_empty() {
711                    return Err("ollama response returned no embeddings".to_string());
712                }
713                if parsed.embeddings.len() != expected_text_count {
714                    return Err(format!(
715                        "ollama response returned {} embeddings for {} inputs",
716                        parsed.embeddings.len(),
717                        expected_text_count
718                    ));
719                }
720
721                let vectors = parsed.embeddings;
722                for vector in &vectors {
723                    if vector.is_empty() {
724                        return Err("ollama response contained empty embeddings".to_string());
725                    }
726                }
727
728                self.dimension = vectors.first().map(Vec::len);
729                Ok(vectors)
730            }
731        }
732    }
733}
734
735/// Pre-validate ONNX Runtime by attempting a raw dlopen before ort touches it.
736/// This catches broken/incompatible .so files without risking a panic in the ort crate.
737/// Also checks the runtime version via OrtGetApiBase if available.
738pub fn pre_validate_onnx_runtime() -> Result<(), String> {
739    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();
740
741    #[cfg(any(target_os = "linux", target_os = "macos"))]
742    {
743        #[cfg(target_os = "linux")]
744        let default_name = "libonnxruntime.so";
745        #[cfg(target_os = "macos")]
746        let default_name = "libonnxruntime.dylib";
747
748        let lib_name = dylib_path.as_deref().unwrap_or(default_name);
749
750        unsafe {
751            let c_name = std::ffi::CString::new(lib_name)
752                .map_err(|e| format!("invalid library path: {}", e))?;
753            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
754            if handle.is_null() {
755                let err = libc::dlerror();
756                let msg = if err.is_null() {
757                    "unknown dlopen error".to_string()
758                } else {
759                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
760                };
761                return Err(format!(
762                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
763                     Run `npx @cortexkit/aft doctor` to diagnose.",
764                    lib_name, msg
765                ));
766            }
767
768            // Try to detect the runtime version from the file path or soname.
769            // libonnxruntime.so.1.19.0, libonnxruntime.1.24.4.dylib, etc.
770            let detected_version = detect_ort_version_from_path(lib_name);
771
772            libc::dlclose(handle);
773
774            // Check version compatibility — we need 1.24.x
775            if let Some(ref version) = detected_version {
776                let parts: Vec<&str> = version.split('.').collect();
777                if let (Some(major), Some(minor)) = (
778                    parts.first().and_then(|s| s.parse::<u32>().ok()),
779                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
780                ) {
781                    if major != 1 || minor < 20 {
782                        return Err(format_ort_version_mismatch(version, lib_name));
783                    }
784                }
785            }
786        }
787    }
788
789    #[cfg(target_os = "windows")]
790    {
791        // On Windows, skip pre-validation — let ort handle LoadLibrary
792        let _ = dylib_path;
793    }
794
795    Ok(())
796}
797
798/// Try to extract the ORT version from the library filename or resolved symlink.
799/// Examples: "libonnxruntime.so.1.19.0" → "1.19.0", "libonnxruntime.1.24.4.dylib" → "1.24.4"
800#[cfg(any(test, target_os = "linux", target_os = "macos"))]
801fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
802    let path = std::path::Path::new(lib_path);
803
804    // Try the path as given, then follow symlinks
805    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
806        .into_iter()
807        .flatten()
808    {
809        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
810            if let Some(version) = extract_version_from_filename(name) {
811                return Some(version);
812            }
813        }
814    }
815
816    // Also check for versioned siblings in the same directory
817    if let Some(parent) = path.parent() {
818        if let Ok(entries) = std::fs::read_dir(parent) {
819            for entry in entries.flatten() {
820                if let Some(name) = entry.file_name().to_str() {
821                    if name.starts_with("libonnxruntime") {
822                        if let Some(version) = extract_version_from_filename(name) {
823                            return Some(version);
824                        }
825                    }
826                }
827            }
828        }
829    }
830
831    None
832}
833
/// Extract the first `X.Y.Z` version from filenames like "libonnxruntime.so.1.19.0"
/// or "libonnxruntime.1.24.4.dylib".
///
/// Returns the leftmost run of the form `<digits>.<digits>.<digits>`; any further
/// components (e.g. the `.4` in `1.2.3.4`) are not included. Returns `None` when
/// the name contains no such run.
///
/// Implemented as a plain byte scan instead of compiling a regex on every call:
/// this function runs once per directory entry during sibling discovery. Only
/// ASCII digits are recognised, which is what the callers' `parse::<u32>()` can
/// actually consume.
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn extract_version_from_filename(name: &str) -> Option<String> {
    let bytes = name.as_bytes();

    // Index just past the (non-empty) run of ASCII digits starting at `from`.
    let digits_end = |from: usize| -> Option<usize> {
        let run = bytes[from..]
            .iter()
            .take_while(|byte| byte.is_ascii_digit())
            .count();
        (run > 0).then_some(from + run)
    };
    // Index just past `.<digits>` starting at `from`.
    let dot_then_digits = |from: usize| -> Option<usize> {
        if bytes.get(from) == Some(&b'.') {
            digits_end(from + 1)
        } else {
            None
        }
    };

    // Try every start offset left to right so the leftmost match wins. Matches
    // begin and end on ASCII bytes, so slicing `name` stays on char boundaries.
    (0..bytes.len()).find_map(|start| {
        digits_end(start)
            .and_then(dot_then_digits)
            .and_then(dot_then_digits)
            .map(|end| name[start..end].to_string())
    })
}
841
/// Shell command (indented by three spaces for embedding in a numbered list)
/// that removes the ONNX Runtime library at `lib_path`.
///
/// Bare default library names and anything under `/usr/local/lib` get the
/// platform's system-wide cleanup command; every other path gets a plain `rm`
/// of that exact path.
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib")
        || lib_path.starts_with("/usr/local/lib");

    if is_system_install {
        if cfg!(target_os = "linux") {
            return "   sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        }
        if cfg!(target_os = "macos") {
            return "   sudo rm /usr/local/lib/libonnxruntime*".to_string();
        }
        if cfg!(target_os = "windows") {
            return "   Delete the ONNX Runtime DLL from your PATH".to_string();
        }
    }

    format!("   rm '{}'", lib_path)
}
857
858/// Build the user-facing error message for an incompatible ONNX Runtime
859/// install. Extracted as a pure helper so we can unit-test the wording
860/// stability — the auto-fix recommendation must always come first because
861/// it's the only safe option, and the system-rm step must remain present
862/// because some users prefer the system-wide cleanup path.
863#[cfg(any(test, target_os = "linux", target_os = "macos"))]
864pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
865    format!(
866        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
867         Solutions:\n\
868         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
869         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
870         configures the bridge to load it instead of the system library — no \
871         changes to '{}'.\n\
872         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
873         {}\n\
874         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
875         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
876        version,
877        lib_name,
878        lib_name,
879        suggest_removal_command(lib_name),
880    )
881}
882
883pub fn initialize_text_embedding(model: &str) -> Result<TextEmbedding, String> {
884    // Pre-validate before ort can panic on a bad library
885    pre_validate_onnx_runtime()?;
886
887    let selected_model = match model {
888        "all-MiniLM-L6-v2" | "all-minilm-l6-v2" => FastembedEmbeddingModel::AllMiniLML6V2,
889        _ => {
890            return Err(format!(
891                "unsupported fastembed model '{}'. Supported: all-MiniLM-L6-v2",
892                model
893            ))
894        }
895    };
896
897    TextEmbedding::try_new(InitOptions::new(selected_model)).map_err(format_embedding_init_error)
898}
899
/// Heuristically decide whether `message` reports a missing or unloadable
/// ONNX Runtime library.
///
/// A message qualifies when it starts (after leading whitespace) with the
/// literal "ONNX Runtime not found.", or when it mentions ONNX Runtime *and* a
/// dynamic-library load failure. The second check is ASCII case-insensitive.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_MARKERS: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_MARKERS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    // Our own pre-validation errors use this exact, case-sensitive prefix.
    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    mentions_any(&RUNTIME_MARKERS) && mentions_any(&LOAD_FAILURE_MARKERS)
}
925
926fn format_embedding_init_error(error: impl Display) -> String {
927    let message = error.to_string();
928
929    if is_onnx_runtime_unavailable(&message) {
930        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
931    }
932
933    format!("failed to initialize semantic embedding model: {message}")
934}
935
/// A chunk of code ready for embedding — derived from a Symbol with context enrichment.
///
/// The index stores one chunk next to each vector; the vector is computed from
/// `embed_text`, while the remaining fields are copied into search results.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// Absolute file path
    pub file: PathBuf,
    /// Symbol name
    pub name: String,
    /// Symbol kind (function, class, struct, etc.)
    pub kind: SymbolKind,
    /// First line of the symbol (0-based internally, inclusive)
    pub start_line: u32,
    /// Last line of the symbol (0-based internally, inclusive)
    pub end_line: u32,
    /// Whether the symbol is exported
    pub exported: bool,
    /// The enriched text that gets embedded (scope + signature + body snippet)
    pub embed_text: String,
    /// Short code snippet for display in results
    pub snippet: String,
}
955
/// A stored embedding entry — chunk metadata + vector
#[derive(Debug)]
struct EmbeddingEntry {
    /// The symbol chunk this vector was computed for.
    chunk: SemanticChunk,
    /// Embedding returned by the backend for `chunk.embed_text`.
    vector: Vec<f32>,
}
962
/// The semantic index — stores embeddings for all symbols in a project
#[derive(Debug)]
pub struct SemanticIndex {
    /// One entry per embedded symbol chunk.
    entries: Vec<EmbeddingEntry>,
    /// Track which files are indexed and their mtime for staleness detection
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Track indexed file sizes alongside mtimes for staleness detection
    file_sizes: HashMap<PathBuf, u64>,
    /// Content hash recorded per indexed file; passed together with the mtime
    /// and size to `cache_freshness::verify_file` for staleness detection.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Embedding dimension (384 for MiniLM-L6-v2)
    dimension: usize,
    /// Backend/model fingerprint of the embeddings, when one has been set.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root this index belongs to (expected to be absolute).
    project_root: PathBuf,
}
977
/// Freshness metadata captured for a file at the time its chunks were collected.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time recorded for the file.
    mtime: SystemTime,
    /// File size in bytes.
    size: u64,
    /// blake3 hash recorded for the file's content.
    content_hash: blake3::Hash,
}
984
/// Outcome of an incremental semantic-index refresh. All counts are file
/// counts; `total_processed` is the number of current/deleted files considered.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    pub changed: usize,
    pub added: usize,
    pub deleted: usize,
    pub total_processed: usize,
}

impl RefreshSummary {
    /// True when the refresh changed, added, and deleted nothing.
    /// `total_processed` does not influence the answer.
    pub fn is_noop(&self) -> bool {
        [self.changed, self.added, self.deleted]
            .iter()
            .all(|&count| count == 0)
    }
}
1001
/// Search result from a semantic query
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File containing the matched symbol.
    pub file: PathBuf,
    /// Symbol name.
    pub name: String,
    /// Symbol kind.
    pub kind: SymbolKind,
    /// First line of the symbol, copied from the indexed chunk.
    pub start_line: u32,
    /// Last line of the symbol, copied from the indexed chunk.
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Short code snippet for display.
    pub snippet: String,
    /// Relevance score; higher is more relevant.
    pub score: f32,
    /// Label of the producer of this result (`SemanticIndex::search` sets "semantic").
    pub source: &'static str,
}
1015
1016impl SemanticIndex {
    /// Create an empty index for `project_root` whose vectors have length `dimension`.
    ///
    /// `project_root` is expected to be absolute; this is only checked by a
    /// `debug_assert!`, i.e. in builds with debug assertions enabled.
    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
        debug_assert!(project_root.is_absolute());
        Self {
            entries: Vec::new(),
            file_mtimes: HashMap::new(),
            file_sizes: HashMap::new(),
            file_hashes: HashMap::new(),
            dimension,
            fingerprint: None,
            project_root,
        }
    }
1029
    /// Number of embedded symbol entries currently held by the index.
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
1034
1035    /// Human-readable status label for the index.
1036    pub fn status_label(&self) -> &'static str {
1037        if self.entries.is_empty() {
1038            "empty"
1039        } else {
1040            "ready"
1041        }
1042    }
1043
1044    fn collect_chunks(
1045        project_root: &Path,
1046        files: &[PathBuf],
1047    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1048        let per_file: Vec<(
1049            PathBuf,
1050            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1051        )> = files
1052            .par_iter()
1053            .map_init(HashMap::new, |parsers, file| {
1054                let result = collect_file_metadata(file).and_then(|metadata| {
1055                    collect_file_chunks(project_root, file, parsers)
1056                        .map(|chunks| (metadata, chunks))
1057                });
1058                (file.clone(), result)
1059            })
1060            .collect();
1061
1062        let mut chunks: Vec<SemanticChunk> = Vec::new();
1063        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1064
1065        for (file, result) in per_file {
1066            match result {
1067                Ok((metadata, file_chunks)) => {
1068                    file_metadata.insert(file, metadata);
1069                    chunks.extend(file_chunks);
1070                }
1071                Err(error) => {
1072                    // "unsupported file extension" is expected for non-code files
1073                    // (json, xml, .gitignore, etc.) that get included in the
1074                    // project walk. Pre-fix this was swallowed by .unwrap_or_default();
1075                    // we now skip silently to keep the log clean. Only real read/parse
1076                    // errors are worth surfacing.
1077                    if error == "unsupported file extension" {
1078                        continue;
1079                    }
1080                    slog_warn!(
1081                        "failed to collect semantic chunks for {}: {}",
1082                        file.display(),
1083                        error
1084                    );
1085                }
1086            }
1087        }
1088
1089        (chunks, file_metadata)
1090    }
1091
1092    fn build_from_chunks<F, P>(
1093        project_root: &Path,
1094        chunks: Vec<SemanticChunk>,
1095        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1096        embed_fn: &mut F,
1097        max_batch_size: usize,
1098        mut progress: Option<&mut P>,
1099    ) -> Result<Self, String>
1100    where
1101        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1102        P: FnMut(usize, usize),
1103    {
1104        debug_assert!(project_root.is_absolute());
1105        let total_chunks = chunks.len();
1106
1107        if chunks.is_empty() {
1108            return Ok(Self {
1109                entries: Vec::new(),
1110                file_mtimes: file_metadata
1111                    .iter()
1112                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
1113                    .collect(),
1114                file_sizes: file_metadata
1115                    .iter()
1116                    .map(|(path, metadata)| (path.clone(), metadata.size))
1117                    .collect(),
1118                file_hashes: file_metadata
1119                    .into_iter()
1120                    .map(|(path, metadata)| (path, metadata.content_hash))
1121                    .collect(),
1122                dimension: DEFAULT_DIMENSION,
1123                fingerprint: None,
1124                project_root: project_root.to_path_buf(),
1125            });
1126        }
1127
1128        // Embed in batches
1129        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1130        let mut expected_dimension: Option<usize> = None;
1131        let batch_size = max_batch_size.max(1);
1132        for batch_start in (0..chunks.len()).step_by(batch_size) {
1133            let batch_end = (batch_start + batch_size).min(chunks.len());
1134            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1135                .iter()
1136                .map(|c| c.embed_text.clone())
1137                .collect();
1138
1139            let vectors = embed_fn(batch_texts)?;
1140            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1141
1142            // Track consistent dimension across all batches
1143            if let Some(dim) = vectors.first().map(|v| v.len()) {
1144                match expected_dimension {
1145                    None => expected_dimension = Some(dim),
1146                    Some(expected) if dim != expected => {
1147                        return Err(format!(
1148                            "embedding dimension changed across batches: expected {expected}, got {dim}"
1149                        ));
1150                    }
1151                    _ => {}
1152                }
1153            }
1154
1155            for (i, vector) in vectors.into_iter().enumerate() {
1156                let chunk_idx = batch_start + i;
1157                entries.push(EmbeddingEntry {
1158                    chunk: chunks[chunk_idx].clone(),
1159                    vector,
1160                });
1161            }
1162
1163            if let Some(callback) = progress.as_mut() {
1164                callback(entries.len(), total_chunks);
1165            }
1166        }
1167
1168        let dimension = entries
1169            .first()
1170            .map(|e| e.vector.len())
1171            .unwrap_or(DEFAULT_DIMENSION);
1172
1173        Ok(Self {
1174            entries,
1175            file_mtimes: file_metadata
1176                .iter()
1177                .map(|(path, metadata)| (path.clone(), metadata.mtime))
1178                .collect(),
1179            file_sizes: file_metadata
1180                .iter()
1181                .map(|(path, metadata)| (path.clone(), metadata.size))
1182                .collect(),
1183            file_hashes: file_metadata
1184                .into_iter()
1185                .map(|(path, metadata)| (path, metadata.content_hash))
1186                .collect(),
1187            dimension,
1188            fingerprint: None,
1189            project_root: project_root.to_path_buf(),
1190        })
1191    }
1192
1193    /// Build the semantic index from a set of files using the provided embedding function.
1194    /// `embed_fn` takes a batch of texts and returns a batch of embedding vectors.
1195    pub fn build<F>(
1196        project_root: &Path,
1197        files: &[PathBuf],
1198        embed_fn: &mut F,
1199        max_batch_size: usize,
1200    ) -> Result<Self, String>
1201    where
1202        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1203    {
1204        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1205        Self::build_from_chunks(
1206            project_root,
1207            chunks,
1208            file_mtimes,
1209            embed_fn,
1210            max_batch_size,
1211            Option::<&mut fn(usize, usize)>::None,
1212        )
1213    }
1214
1215    /// Build the semantic index and report embedding progress using entry counts.
1216    pub fn build_with_progress<F, P>(
1217        project_root: &Path,
1218        files: &[PathBuf],
1219        embed_fn: &mut F,
1220        max_batch_size: usize,
1221        progress: &mut P,
1222    ) -> Result<Self, String>
1223    where
1224        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1225        P: FnMut(usize, usize),
1226    {
1227        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1228        let total_chunks = chunks.len();
1229        progress(0, total_chunks);
1230        Self::build_from_chunks(
1231            project_root,
1232            chunks,
1233            file_mtimes,
1234            embed_fn,
1235            max_batch_size,
1236            Some(progress),
1237        )
1238    }
1239
    /// Incrementally refresh entries for changed/new files only, preserving cached
    /// embeddings for unchanged files. Used when loading the index from disk and
    /// finding that a small fraction of files have moved on, deleted, or appeared.
    ///
    /// Returns `RefreshSummary` describing what changed. On success, `self` is
    /// mutated in place and remains a valid index.
    ///
    /// `current_files` is the full set of files the project considers indexable
    /// (typically `walk_project_files(...)`). Files in the cache that are no
    /// longer in this set are treated as deleted.
    ///
    /// `progress` is called with `(entries_done, total_entries)`: `(0, 0)` when
    /// nothing needs embedding, otherwise `(0, total)` once and then after each
    /// embedded batch. `max_batch_size` bounds the texts per `embed_fn` call
    /// (`0` is treated as `1`).
    ///
    /// # Errors
    /// Propagates errors from `embed_fn` and `validate_embedding_batch`, and
    /// fails when new vectors do not match the dimension of the existing
    /// entries. When an error is returned, entries of deleted files have
    /// already been dropped and mtime/size updates for content-fresh files have
    /// already been applied, but no changed or added file has been replaced.
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        // 1. Bucket files into deleted / changed / added.
        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        // Size of the union of walked files and previously indexed files:
        // |current| + |indexed| - |current ∩ indexed|.
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Files in cache that disappeared from disk OR are no longer in the
        // walked set. Both cases need their entries dropped.
        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        // Snapshot the keys: the loop below may update the mtime/size maps.
        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
        for indexed_path in &indexed_paths {
            if !current_set.contains(indexed_path.as_path()) {
                deleted.push(indexed_path.clone());
                continue;
            }
            // Freshness can only be verified when mtime, size and hash are all
            // cached; otherwise the file is treated as changed below.
            let cached = match (
                self.file_mtimes.get(indexed_path),
                self.file_sizes.get(indexed_path),
                self.file_hashes.get(indexed_path),
            ) {
                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                    mtime: *mtime,
                    size: *size,
                    content_hash: *hash,
                }),
                _ => None,
            };
            match cached.map(|freshness| cache_freshness::verify_file(indexed_path, &freshness)) {
                Some(FreshnessVerdict::HotFresh) => {}
                // Same content, new mtime/size: keep the embeddings and just
                // refresh the cached metadata.
                Some(FreshnessVerdict::ContentFresh {
                    new_mtime,
                    new_size,
                }) => {
                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
                    self.file_sizes.insert(indexed_path.clone(), new_size);
                }
                // NOTE(review): a `Deleted` verdict for a path that is still in
                // `current_files` is queued as *changed*, not deleted. If
                // re-collection then fails for it, its old entries are kept —
                // this differs from the "disappeared from disk" wording in the
                // comment above; confirm which behavior is intended.
                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
                    changed.push(indexed_path.clone());
                }
            }
        }

        // Files in walk that were never indexed.
        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        // Fast path: nothing to do.
        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        // 2. Drop entries for deleted files immediately. Changed files are only
        //    replaced after successful re-extraction + embedding so transient
        //    read/parse errors keep the stale-but-valid cache entry.
        if !deleted.is_empty() {
            let deleted_set: HashSet<&Path> = deleted.iter().map(PathBuf::as_path).collect();
            self.entries
                .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
            for path in &deleted {
                self.file_mtimes.remove(path);
                self.file_sizes.remove(path);
                self.file_hashes.remove(path);
            }
        }

        // 3. Embed the changed + added set, if any.
        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        if to_embed.is_empty() {
            // Only deletions happened.
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // `fresh_metadata` only contains files that were collected successfully.
        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);

        // No chunks to embed: successfully collected files simply have no
        // symbols now, so drop their old entries and record the new metadata.
        if chunks.is_empty() {
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            // Only files that were re-collected successfully count as
            // changed/added in the summary.
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file.clone(), metadata.size);
                self.file_hashes.insert(file.clone(), metadata.content_hash);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // 4. Embed in batches and dimension-check against the existing index.
        let total_chunks = chunks.len();
        progress(0, total_chunks);
        let batch_size = max_batch_size.max(1);
        // With no remaining entries there is no dimension to be consistent
        // with; the first new batch then defines it.
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
        let mut observed_dimension: Option<usize> = existing_dimension;

        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|c| c.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            if let Some(dim) = vectors.first().map(|v| v.len()) {
                match observed_dimension {
                    None => observed_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        // Refuse to mix dimensions in one index. Caller should
                        // fall back to a full rebuild.
                        return Err(format!(
                            "embedding dimension changed during incremental refresh: \
                             cached index uses {expected}, new vectors use {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                new_entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            progress(new_entries.len(), total_chunks);
        }

        // All batches succeeded: only now replace the old entries of the
        // re-collected files with the freshly embedded ones.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file, metadata.content_hash);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }
1464
1465    /// Search the index with a query embedding, returning top-K results sorted by relevance
1466    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1467        if self.entries.is_empty() || query_vector.len() != self.dimension {
1468            return Vec::new();
1469        }
1470
1471        let mut scored: Vec<(f32, usize)> = self
1472            .entries
1473            .iter()
1474            .enumerate()
1475            .map(|(i, entry)| {
1476                let mut score = cosine_similarity(query_vector, &entry.vector);
1477                if entry.chunk.exported {
1478                    score *= 1.1;
1479                }
1480                (score, i)
1481            })
1482            .collect();
1483
1484        // Sort descending by score
1485        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1486
1487        scored
1488            .into_iter()
1489            .take(top_k)
1490            // Keep the sort → take → map ordering explicit: removing the old
1491            // `> 0.0` floor cannot evict positive hits because top_k has already
1492            // been selected, but it can surface zero-score noise in the tail.
1493            .map(|(score, idx)| {
1494                let entry = &self.entries[idx];
1495                SemanticResult {
1496                    file: entry.chunk.file.clone(),
1497                    name: entry.chunk.name.clone(),
1498                    kind: entry.chunk.kind.clone(),
1499                    start_line: entry.chunk.start_line,
1500                    end_line: entry.chunk.end_line,
1501                    exported: entry.chunk.exported,
1502                    snippet: entry.chunk.snippet.clone(),
1503                    score,
1504                    source: "semantic",
1505                }
1506            })
1507            .collect()
1508    }
1509
    /// Number of indexed entries (one per embedded symbol chunk).
    pub fn len(&self) -> usize {
        self.entries.len()
    }
1514
1515    /// Check if a file needs re-indexing based on mtime/size
1516    pub fn is_file_stale(&self, file: &Path) -> bool {
1517        let Some(stored_mtime) = self.file_mtimes.get(file) else {
1518            return true;
1519        };
1520        let Some(stored_size) = self.file_sizes.get(file) else {
1521            return true;
1522        };
1523        let Some(stored_hash) = self.file_hashes.get(file) else {
1524            return true;
1525        };
1526        let cached = FileFreshness {
1527            mtime: *stored_mtime,
1528            size: *stored_size,
1529            content_hash: *stored_hash,
1530        };
1531        match cache_freshness::verify_file(file, &cached) {
1532            FreshnessVerdict::HotFresh => false,
1533            FreshnessVerdict::ContentFresh { .. } => false,
1534            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
1535        }
1536    }
1537
1538    fn backfill_missing_file_sizes(&mut self) {
1539        for path in self.file_mtimes.keys() {
1540            if self.file_sizes.contains_key(path) {
1541                continue;
1542            }
1543            if let Ok(metadata) = fs::metadata(path) {
1544                self.file_sizes.insert(path.clone(), metadata.len());
1545                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
1546                    self.file_hashes.insert(path.clone(), hash);
1547                }
1548            }
1549        }
1550    }
1551
    /// Remove entries for a specific file.
    ///
    /// Thin alias for `invalidate_file`: the file's cached mtime, size and
    /// hash are removed as well.
    pub fn remove_file(&mut self, file: &Path) {
        self.invalidate_file(file);
    }
1556
1557    pub fn invalidate_file(&mut self, file: &Path) {
1558        self.entries.retain(|e| e.chunk.file != file);
1559        self.file_mtimes.remove(file);
1560        self.file_sizes.remove(file);
1561        self.file_hashes.remove(file);
1562    }
1563
    /// Get the embedding dimension (the vector length `search` requires of a query).
    pub fn dimension(&self) -> usize {
        self.dimension
    }
1568
    /// Fingerprint recorded for this index, if one has been set.
    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
        self.fingerprint.as_ref()
    }
1572
    /// Backend name from the fingerprint, or `None` when no fingerprint is set.
    pub fn backend_label(&self) -> Option<&str> {
        self.fingerprint.as_ref().map(|f| f.backend.as_str())
    }
1576
    /// Model name from the fingerprint, or `None` when no fingerprint is set.
    pub fn model_label(&self) -> Option<&str> {
        self.fingerprint.as_ref().map(|f| f.model.as_str())
    }
1580
    /// Record `fingerprint` on the index, replacing any previous one.
    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
        self.fingerprint = Some(fingerprint);
    }
1584
1585    /// Write the semantic index to disk using atomic temp+rename pattern
1586    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
1587        // Don't persist empty indexes — they would be loaded on next startup
1588        // and prevent a fresh build that might find files.
1589        if self.entries.is_empty() {
1590            slog_info!("skipping semantic index persistence (0 entries)");
1591            return;
1592        }
1593        let dir = storage_dir.join("semantic").join(project_key);
1594        if let Err(e) = fs::create_dir_all(&dir) {
1595            slog_warn!("failed to create semantic cache dir: {}", e);
1596            return;
1597        }
1598        let data_path = dir.join("semantic.bin");
1599        let tmp_path = dir.join(format!(
1600            "semantic.bin.tmp.{}.{}",
1601            std::process::id(),
1602            SystemTime::now()
1603                .duration_since(SystemTime::UNIX_EPOCH)
1604                .unwrap_or(Duration::ZERO)
1605                .as_nanos()
1606        ));
1607        let bytes = self.to_bytes();
1608        let write_result = (|| -> std::io::Result<()> {
1609            use std::io::Write;
1610            let mut file = fs::File::create(&tmp_path)?;
1611            file.write_all(&bytes)?;
1612            file.sync_all()?;
1613            Ok(())
1614        })();
1615        if let Err(e) = write_result {
1616            slog_warn!("failed to write semantic index: {}", e);
1617            let _ = fs::remove_file(&tmp_path);
1618            return;
1619        }
1620        if let Err(e) = fs::rename(&tmp_path, &data_path) {
1621            slog_warn!("failed to rename semantic index: {}", e);
1622            let _ = fs::remove_file(&tmp_path);
1623            return;
1624        }
1625        slog_info!(
1626            "semantic index persisted: {} entries, {:.1} KB",
1627            self.entries.len(),
1628            bytes.len() as f64 / 1024.0
1629        );
1630    }
1631
1632    /// Read the semantic index from disk
1633    pub fn read_from_disk(
1634        storage_dir: &Path,
1635        project_key: &str,
1636        current_canonical_root: &Path,
1637        is_worktree_bridge: bool,
1638        expected_fingerprint: Option<&str>,
1639    ) -> Option<Self> {
1640        debug_assert!(current_canonical_root.is_absolute());
1641        let data_path = storage_dir
1642            .join("semantic")
1643            .join(project_key)
1644            .join("semantic.bin");
1645        let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
1646        if file_len < HEADER_BYTES_V1 {
1647            slog_warn!(
1648                "corrupt semantic index (too small: {} bytes), removing",
1649                file_len
1650            );
1651            if !is_worktree_bridge {
1652                let _ = fs::remove_file(&data_path);
1653            }
1654            return None;
1655        }
1656
1657        let bytes = fs::read(&data_path).ok()?;
1658        let version = bytes[0];
1659        if version != SEMANTIC_INDEX_VERSION_V6 {
1660            slog_info!(
1661                "cached semantic index version {} is older than {}, rebuilding",
1662                version,
1663                SEMANTIC_INDEX_VERSION_V6
1664            );
1665            if !is_worktree_bridge {
1666                let _ = fs::remove_file(&data_path);
1667            }
1668            return None;
1669        }
1670        match Self::from_bytes(&bytes, current_canonical_root) {
1671            Ok(index) => {
1672                if index.entries.is_empty() {
1673                    slog_info!("cached semantic index is empty, will rebuild");
1674                    if !is_worktree_bridge {
1675                        let _ = fs::remove_file(&data_path);
1676                    }
1677                    return None;
1678                }
1679                if let Some(expected) = expected_fingerprint {
1680                    let matches = index
1681                        .fingerprint()
1682                        .map(|fingerprint| fingerprint.matches_expected(expected))
1683                        .unwrap_or(false);
1684                    if !matches {
1685                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
1686                        if !is_worktree_bridge {
1687                            let _ = fs::remove_file(&data_path);
1688                        }
1689                        return None;
1690                    }
1691                }
1692                slog_info!(
1693                    "loaded semantic index from disk: {} entries",
1694                    index.entries.len()
1695                );
1696                Some(index)
1697            }
1698            Err(e) => {
1699                slog_warn!("corrupt semantic index, rebuilding: {}", e);
1700                if !is_worktree_bridge {
1701                    let _ = fs::remove_file(&data_path);
1702                }
1703                None
1704            }
1705        }
1706    }
1707
1708    /// Serialize the index to bytes for disk persistence
1709    pub fn to_bytes(&self) -> Vec<u8> {
1710        let mut buf = Vec::new();
1711        let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
1712            let encoded = fingerprint.as_string();
1713            if encoded.is_empty() {
1714                None
1715            } else {
1716                Some(encoded.into_bytes())
1717            }
1718        });
1719        let file_mtimes: Vec<_> = self
1720            .file_mtimes
1721            .iter()
1722            .filter_map(|(path, mtime)| {
1723                cache_relative_path(&self.project_root, path)
1724                    .map(|relative| (relative, path, mtime))
1725            })
1726            .collect();
1727        let entries: Vec<_> = self
1728            .entries
1729            .iter()
1730            .filter_map(|entry| {
1731                cache_relative_path(&self.project_root, &entry.chunk.file)
1732                    .map(|relative| (relative, entry))
1733            })
1734            .collect();
1735
1736        // Header: version(1) + dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint
1737        //
1738        // V6 is the single write format. Layout extends V5:
1739        //   - fingerprint is always represented (absent ⇒ fingerprint_len=0,
1740        //     no bytes follow). Uniform format simplifies the reader.
1741        //   - paths are relative to project_root.
1742        //   - file metadata stored as secs(u64) + subsec_nanos(u32) + size(u64) + blake3(32).
1743        //     Preserves full APFS/ext4/NTFS precision and catches mtime ties.
1744        //
1745        // V1/V2 remain readable for backward compatibility (see from_bytes).
1746        // V3/V4 load as compatible formats but are rejected on disk so snippets
1747        // and file sizes are rebuilt once.
1748        let version = SEMANTIC_INDEX_VERSION_V6;
1749        buf.push(version);
1750        buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
1751        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
1752        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
1753        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
1754        buf.extend_from_slice(fp_bytes_ref);
1755
1756        // File mtime table: count(4) + entries
1757        // V3 layout per entry: path_len(4) + path + secs(8) + subsec_nanos(4)
1758        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
1759        for (relative, path, mtime) in &file_mtimes {
1760            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
1761            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
1762            buf.extend_from_slice(&path_bytes);
1763            let duration = mtime
1764                .duration_since(SystemTime::UNIX_EPOCH)
1765                .unwrap_or_default();
1766            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
1767            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
1768            let size = self.file_sizes.get(*path).copied().unwrap_or_default();
1769            buf.extend_from_slice(&size.to_le_bytes());
1770            let hash = self
1771                .file_hashes
1772                .get(*path)
1773                .copied()
1774                .unwrap_or_else(cache_freshness::zero_hash);
1775            buf.extend_from_slice(hash.as_bytes());
1776        }
1777
1778        // Entries: each is metadata + vector
1779        for (relative, entry) in &entries {
1780            let c = &entry.chunk;
1781
1782            // File path
1783            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
1784            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
1785            buf.extend_from_slice(&file_bytes);
1786
1787            // Name
1788            let name_bytes = c.name.as_bytes();
1789            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
1790            buf.extend_from_slice(name_bytes);
1791
1792            // Kind (1 byte)
1793            buf.push(symbol_kind_to_u8(&c.kind));
1794
1795            // Lines + exported
1796            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
1797            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
1798            buf.push(c.exported as u8);
1799
1800            // Snippet
1801            let snippet_bytes = c.snippet.as_bytes();
1802            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
1803            buf.extend_from_slice(snippet_bytes);
1804
1805            // Embed text
1806            let embed_bytes = c.embed_text.as_bytes();
1807            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
1808            buf.extend_from_slice(embed_bytes);
1809
1810            // Vector (f32 array)
1811            for &val in &entry.vector {
1812                buf.extend_from_slice(&val.to_le_bytes());
1813            }
1814        }
1815
1816        buf
1817    }
1818
1819    /// Deserialize the index from bytes
1820    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
1821        debug_assert!(current_canonical_root.is_absolute());
1822        let mut pos = 0;
1823
1824        if data.len() < HEADER_BYTES_V1 {
1825            return Err("data too short".to_string());
1826        }
1827
1828        let version = data[pos];
1829        pos += 1;
1830        if version != SEMANTIC_INDEX_VERSION_V1
1831            && version != SEMANTIC_INDEX_VERSION_V2
1832            && version != SEMANTIC_INDEX_VERSION_V3
1833            && version != SEMANTIC_INDEX_VERSION_V4
1834            && version != SEMANTIC_INDEX_VERSION_V5
1835            && version != SEMANTIC_INDEX_VERSION_V6
1836        {
1837            return Err(format!("unsupported version: {}", version));
1838        }
1839        // V2 and newer share the same header layout (V3/V4/V5 only differ from
1840        // V2 in the per-mtime entry layout): version(1) + dimension(4) +
1841        // entry_count(4) + fingerprint_len(4) + fingerprint bytes.
1842        if (version == SEMANTIC_INDEX_VERSION_V2
1843            || version == SEMANTIC_INDEX_VERSION_V3
1844            || version == SEMANTIC_INDEX_VERSION_V4
1845            || version == SEMANTIC_INDEX_VERSION_V5
1846            || version == SEMANTIC_INDEX_VERSION_V6)
1847            && data.len() < HEADER_BYTES_V2
1848        {
1849            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
1850        }
1851
1852        let dimension = read_u32(data, &mut pos)? as usize;
1853        let entry_count = read_u32(data, &mut pos)? as usize;
1854        validate_embedding_dimension(dimension)?;
1855        if entry_count > MAX_ENTRIES {
1856            return Err(format!("too many semantic index entries: {}", entry_count));
1857        }
1858
1859        // Fingerprint handling:
1860        //   - V1: no fingerprint field at all.
1861        //   - V2: fingerprint_len + fingerprint bytes; always present (writer
1862        //     only emitted V2 when fingerprint was Some).
1863        //   - V3+: fingerprint_len always present; fingerprint_len==0 ⇒ None.
1864        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
1865            || version == SEMANTIC_INDEX_VERSION_V3
1866            || version == SEMANTIC_INDEX_VERSION_V4
1867            || version == SEMANTIC_INDEX_VERSION_V5
1868            || version == SEMANTIC_INDEX_VERSION_V6;
1869        let fingerprint = if has_fingerprint_field {
1870            let fingerprint_len = read_u32(data, &mut pos)? as usize;
1871            if pos + fingerprint_len > data.len() {
1872                return Err("unexpected end of data reading fingerprint".to_string());
1873            }
1874            if fingerprint_len == 0 {
1875                None
1876            } else {
1877                let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
1878                pos += fingerprint_len;
1879                Some(
1880                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
1881                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
1882                )
1883            }
1884        } else {
1885            None
1886        };
1887
1888        // File mtimes
1889        let mtime_count = read_u32(data, &mut pos)? as usize;
1890        if mtime_count > MAX_ENTRIES {
1891            return Err(format!("too many semantic file mtimes: {}", mtime_count));
1892        }
1893
1894        let vector_bytes = entry_count
1895            .checked_mul(dimension)
1896            .and_then(|count| count.checked_mul(F32_BYTES))
1897            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
1898        if vector_bytes > data.len().saturating_sub(pos) {
1899            return Err("semantic index vectors exceed available data".to_string());
1900        }
1901
1902        let mut file_mtimes = HashMap::with_capacity(mtime_count);
1903        let mut file_sizes = HashMap::with_capacity(mtime_count);
1904        let mut file_hashes = HashMap::with_capacity(mtime_count);
1905        for _ in 0..mtime_count {
1906            let path = read_string(data, &mut pos)?;
1907            let secs = read_u64(data, &mut pos)?;
1908            // V3+ persists subsec_nanos alongside secs so staleness checks
1909            // survive restart round-trips. V1/V2 load with 0 nanos, which
1910            // causes one rebuild on upgrade (they never matched live APFS
1911            // mtimes anyway — the bug v0.15.2 fixes). After that rebuild,
1912            // the cache is persisted as V3 and stabilises.
1913            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
1914                || version == SEMANTIC_INDEX_VERSION_V4
1915                || version == SEMANTIC_INDEX_VERSION_V5
1916                || version == SEMANTIC_INDEX_VERSION_V6
1917            {
1918                read_u32(data, &mut pos)?
1919            } else {
1920                0
1921            };
1922            let size =
1923                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
1924                    read_u64(data, &mut pos)?
1925                } else {
1926                    0
1927                };
1928            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
1929                if pos + 32 > data.len() {
1930                    return Err("unexpected end of data reading content hash".to_string());
1931                }
1932                let mut hash_bytes = [0u8; 32];
1933                hash_bytes.copy_from_slice(&data[pos..pos + 32]);
1934                pos += 32;
1935                blake3::Hash::from_bytes(hash_bytes)
1936            } else {
1937                cache_freshness::zero_hash()
1938            };
1939            // Hardening against corrupt / maliciously crafted cache files
1940            // (v0.15.2). `Duration::new(secs, nanos)` can panic when the
1941            // nanosecond carry overflows the second counter, and
1942            // `SystemTime + Duration` can panic on carry past the platform's
1943            // upper bound. Explicit validation keeps a corrupted semantic.bin
1944            // from taking down the whole aft process.
1945            if nanos >= 1_000_000_000 {
1946                return Err(format!(
1947                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
1948                    nanos
1949                ));
1950            }
1951            let duration = std::time::Duration::new(secs, nanos);
1952            let mtime = SystemTime::UNIX_EPOCH
1953                .checked_add(duration)
1954                .ok_or_else(|| {
1955                    format!(
1956                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
1957                        secs, nanos
1958                    )
1959                })?;
1960            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
1961                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
1962                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
1963            } else {
1964                PathBuf::from(path)
1965            };
1966            file_mtimes.insert(path.clone(), mtime);
1967            file_sizes.insert(path.clone(), size);
1968            file_hashes.insert(path, content_hash);
1969        }
1970
1971        // Entries
1972        let mut entries = Vec::with_capacity(entry_count);
1973        for _ in 0..entry_count {
1974            let raw_file = PathBuf::from(read_string(data, &mut pos)?);
1975            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
1976                cached_path_under_root(current_canonical_root, &raw_file)
1977                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
1978            } else {
1979                raw_file
1980            };
1981            let name = read_string(data, &mut pos)?;
1982
1983            if pos >= data.len() {
1984                return Err("unexpected end of data".to_string());
1985            }
1986            let kind = u8_to_symbol_kind(data[pos]);
1987            pos += 1;
1988
1989            let start_line = read_u32(data, &mut pos)?;
1990            let end_line = read_u32(data, &mut pos)?;
1991
1992            if pos >= data.len() {
1993                return Err("unexpected end of data".to_string());
1994            }
1995            let exported = data[pos] != 0;
1996            pos += 1;
1997
1998            let snippet = read_string(data, &mut pos)?;
1999            let embed_text = read_string(data, &mut pos)?;
2000
2001            // Vector
2002            let vec_bytes = dimension
2003                .checked_mul(F32_BYTES)
2004                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2005            if pos + vec_bytes > data.len() {
2006                return Err("unexpected end of data reading vector".to_string());
2007            }
2008            let mut vector = Vec::with_capacity(dimension);
2009            for _ in 0..dimension {
2010                let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
2011                vector.push(f32::from_le_bytes(bytes));
2012                pos += 4;
2013            }
2014
2015            entries.push(EmbeddingEntry {
2016                chunk: SemanticChunk {
2017                    file,
2018                    name,
2019                    kind,
2020                    start_line,
2021                    end_line,
2022                    exported,
2023                    embed_text,
2024                    snippet,
2025                },
2026                vector,
2027            });
2028        }
2029
2030        if entries.len() != entry_count {
2031            return Err(format!(
2032                "semantic cache entry count drift: header={} decoded={}",
2033                entry_count,
2034                entries.len()
2035            ));
2036        }
2037        for entry in &entries {
2038            if !file_mtimes.contains_key(&entry.chunk.file) {
2039                return Err(format!(
2040                    "semantic cache metadata missing for entry file {}",
2041                    entry.chunk.file.display()
2042                ));
2043            }
2044        }
2045
2046        Ok(Self {
2047            entries,
2048            file_mtimes,
2049            file_sizes,
2050            file_hashes,
2051            dimension,
2052            fingerprint,
2053            project_root: current_canonical_root.to_path_buf(),
2054        })
2055    }
2056}
2057
2058/// Build enriched embedding text from a symbol with cAST-style context
2059fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2060    let relative = file
2061        .strip_prefix(project_root)
2062        .unwrap_or(file)
2063        .to_string_lossy();
2064
2065    let kind_label = match &symbol.kind {
2066        SymbolKind::Function => "function",
2067        SymbolKind::Class => "class",
2068        SymbolKind::Method => "method",
2069        SymbolKind::Struct => "struct",
2070        SymbolKind::Interface => "interface",
2071        SymbolKind::Enum => "enum",
2072        SymbolKind::TypeAlias => "type",
2073        SymbolKind::Variable => "variable",
2074        SymbolKind::Heading => "heading",
2075        SymbolKind::FileSummary => "file-summary",
2076    };
2077
2078    // Build: "file:relative/path kind:function name:validateAuth signature:fn validateAuth(token: &str) -> bool"
2079    let name = &symbol.name;
2080    let mut text = format!(
2081        "name:{name} file:{} kind:{} name:{name}",
2082        relative, kind_label
2083    );
2084
2085    if let Some(sig) = &symbol.signature {
2086        text.push_str(&format!(" signature:{}", sig));
2087    }
2088
2089    // Add body snippet (first ~300 chars of symbol body)
2090    let lines: Vec<&str> = source.lines().collect();
2091    let start = (symbol.range.start_line as usize).min(lines.len());
2092    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2093    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2094    if start < end {
2095        let body: String = lines[start..end]
2096            .iter()
2097            .take(15) // max 15 lines
2098            .copied()
2099            .collect::<Vec<&str>>()
2100            .join("\n");
2101        let snippet = if body.len() > 300 {
2102            format!("{}...", &body[..body.floor_char_boundary(300)])
2103        } else {
2104            body
2105        };
2106        text.push_str(&format!(" body:{}", snippet));
2107    }
2108
2109    text
2110}
2111
/// Return at most the first `max_chars` characters (Unicode scalar values,
/// not bytes) of `value` as a new `String`.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte offset of the character at index `max_chars` is exactly where
    // the kept prefix ends; if there is no such character, keep everything.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
2115
2116fn first_leading_doc_comment(source: &str) -> String {
2117    let lines: Vec<&str> = source.lines().collect();
2118    let Some((start, first)) = lines
2119        .iter()
2120        .enumerate()
2121        .find(|(_, line)| !line.trim().is_empty())
2122    else {
2123        return String::new();
2124    };
2125
2126    let trimmed = first.trim_start();
2127    if trimmed.starts_with("/**") {
2128        let mut comment = Vec::new();
2129        for line in lines.iter().skip(start) {
2130            comment.push(*line);
2131            if line.contains("*/") {
2132                break;
2133            }
2134        }
2135        return truncate_chars(&comment.join("\n"), 200);
2136    }
2137
2138    if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2139        let comment = lines
2140            .iter()
2141            .skip(start)
2142            .take_while(|line| {
2143                let trimmed = line.trim_start();
2144                trimmed.starts_with("///") || trimmed.starts_with("//!")
2145            })
2146            .copied()
2147            .collect::<Vec<_>>()
2148            .join("\n");
2149        return truncate_chars(&comment, 200);
2150    }
2151
2152    String::new()
2153}
2154
2155pub fn build_file_summary_chunk(
2156    file: &Path,
2157    project_root: &Path,
2158    source: &str,
2159    top_exports: &[&str],
2160    top_export_signatures: &[Option<&str>],
2161) -> SemanticChunk {
2162    let relative = file.strip_prefix(project_root).unwrap_or(file);
2163    let rel_path = relative.to_string_lossy();
2164    let parent_dir = relative
2165        .parent()
2166        .map(|parent| parent.to_string_lossy().to_string())
2167        .unwrap_or_default();
2168    let name = file
2169        .file_stem()
2170        .map(|stem| stem.to_string_lossy().to_string())
2171        .unwrap_or_default();
2172    let doc = first_leading_doc_comment(source);
2173    let exports = top_exports
2174        .iter()
2175        .take(5)
2176        .copied()
2177        .collect::<Vec<_>>()
2178        .join(",");
2179    let snippet = if doc.is_empty() {
2180        top_export_signatures
2181            .first()
2182            .and_then(|signature| signature.as_deref())
2183            .map(|signature| truncate_chars(signature, 200))
2184            .unwrap_or_default()
2185    } else {
2186        doc.clone()
2187    };
2188
2189    SemanticChunk {
2190        file: file.to_path_buf(),
2191        name,
2192        kind: SymbolKind::FileSummary,
2193        start_line: 0,
2194        end_line: 0,
2195        exported: false,
2196        embed_text: format!(
2197            "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2198            file.file_stem()
2199                .map(|stem| stem.to_string_lossy().to_string())
2200                .unwrap_or_default()
2201        ),
2202        snippet,
2203    }
2204}
2205
2206fn parser_for(
2207    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2208    lang: crate::parser::LangId,
2209) -> Result<&mut Parser, String> {
2210    use std::collections::hash_map::Entry;
2211
2212    match parsers.entry(lang) {
2213        Entry::Occupied(entry) => Ok(entry.into_mut()),
2214        Entry::Vacant(entry) => {
2215            let grammar = grammar_for(lang);
2216            let mut parser = Parser::new();
2217            parser
2218                .set_language(&grammar)
2219                .map_err(|error| error.to_string())?;
2220            Ok(entry.insert(parser))
2221        }
2222    }
2223}
2224
/// Whether `path` has one of the file extensions covered by the semantic index.
///
/// The comparison is case-sensitive and looks only at the final extension;
/// paths without an extension, or with one that is not valid UTF-8, are not
/// indexed.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "sol", "vue",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
2252
2253fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2254    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2255    let mtime = metadata.modified().map_err(|error| error.to_string())?;
2256    let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2257        .map_err(|error| error.to_string())?
2258        .unwrap_or_else(cache_freshness::zero_hash);
2259    Ok(IndexedFileMetadata {
2260        mtime,
2261        size: metadata.len(),
2262        content_hash,
2263    })
2264}
2265
2266fn collect_file_chunks(
2267    project_root: &Path,
2268    file: &Path,
2269    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2270) -> Result<Vec<SemanticChunk>, String> {
2271    if !is_semantic_indexed_extension(file) {
2272        return Err("unsupported file extension".to_string());
2273    }
2274    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2275    let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2276    let tree = parser_for(parsers, lang)?
2277        .parse(&source, None)
2278        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2279    let symbols =
2280        extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2281
2282    Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2283}
2284
2285/// Build a display snippet from a symbol's source
2286fn build_snippet(symbol: &Symbol, source: &str) -> String {
2287    let lines: Vec<&str> = source.lines().collect();
2288    let start = (symbol.range.start_line as usize).min(lines.len());
2289    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2290    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2291    if start < end {
2292        let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2293        let mut snippet = snippet_lines.join("\n");
2294        if end - start > 5 {
2295            snippet.push_str("\n  ...");
2296        }
2297        if snippet.len() > 300 {
2298            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2299        }
2300        snippet
2301    } else {
2302        String::new()
2303    }
2304}
2305
2306/// Convert symbols to semantic chunks with enriched context
2307fn symbols_to_chunks(
2308    file: &Path,
2309    symbols: &[Symbol],
2310    source: &str,
2311    project_root: &Path,
2312) -> Vec<SemanticChunk> {
2313    let mut chunks = Vec::new();
2314    let top_exports_with_signatures = symbols
2315        .iter()
2316        .filter(|symbol| {
2317            symbol.exported
2318                && symbol.parent.is_none()
2319                && !matches!(symbol.kind, SymbolKind::Heading)
2320        })
2321        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2322        .collect::<Vec<_>>();
2323
2324    let has_only_headings = !symbols.is_empty()
2325        && symbols
2326            .iter()
2327            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2328    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2329        let top_exports = top_exports_with_signatures
2330            .iter()
2331            .map(|(name, _)| *name)
2332            .collect::<Vec<_>>();
2333        let top_export_signatures = top_exports_with_signatures
2334            .iter()
2335            .map(|(_, signature)| *signature)
2336            .collect::<Vec<_>>();
2337        chunks.push(build_file_summary_chunk(
2338            file,
2339            project_root,
2340            source,
2341            &top_exports,
2342            &top_export_signatures,
2343        ));
2344    }
2345
2346    for symbol in symbols {
2347        // Skip Markdown / HTML heading chunks: empirically they dominate result
2348        // lists even for code-shaped queries because heading prose embeds well.
2349        // Agents querying for code lose the actual matches under doc noise.
2350        // README/docs queries are still served by grep on the same files.
2351        if matches!(symbol.kind, SymbolKind::Heading) {
2352            continue;
2353        }
2354
2355        // Skip very small symbols (single-line variables, etc.)
2356        let line_count = symbol
2357            .range
2358            .end_line
2359            .saturating_sub(symbol.range.start_line)
2360            + 1;
2361        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2362            continue;
2363        }
2364
2365        let embed_text = build_embed_text(symbol, source, file, project_root);
2366        let snippet = build_snippet(symbol, source);
2367
2368        chunks.push(SemanticChunk {
2369            file: file.to_path_buf(),
2370            name: symbol.name.clone(),
2371            kind: symbol.kind.clone(),
2372            start_line: symbol.range.start_line,
2373            end_line: symbol.range.end_line,
2374            exported: symbol.exported,
2375            embed_text,
2376            snippet,
2377        });
2378
2379        // Note: Nested symbols are handled separately by the outline system
2380        // Each symbol is indexed individually
2381    }
2382
2383    chunks
2384}
2385
/// Computes the cosine similarity of `a` and `b`.
///
/// Returns `0.0` when the slices differ in length, and also when the product
/// of the two magnitudes is zero (which covers empty and all-zero input), so
/// the division below never sees a zero denominator.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // One pass accumulating the dot product and both squared norms, in the
    // same element order as a plain indexed loop.
    let (dot, sq_norm_a, sq_norm_b) = a.iter().zip(b.iter()).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, sq_a, sq_b), (&x, &y)| (dot + x * y, sq_a + x * x, sq_b + y * y),
    );

    let magnitude = sq_norm_a.sqrt() * sq_norm_b.sqrt();
    if magnitude == 0.0 {
        return 0.0;
    }
    dot / magnitude
}
2409
2410// Serialization helpers
2411fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2412    match kind {
2413        SymbolKind::Function => 0,
2414        SymbolKind::Class => 1,
2415        SymbolKind::Method => 2,
2416        SymbolKind::Struct => 3,
2417        SymbolKind::Interface => 4,
2418        SymbolKind::Enum => 5,
2419        SymbolKind::TypeAlias => 6,
2420        SymbolKind::Variable => 7,
2421        SymbolKind::Heading => 8,
2422        SymbolKind::FileSummary => 9,
2423    }
2424}
2425
2426fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2427    match v {
2428        0 => SymbolKind::Function,
2429        1 => SymbolKind::Class,
2430        2 => SymbolKind::Method,
2431        3 => SymbolKind::Struct,
2432        4 => SymbolKind::Interface,
2433        5 => SymbolKind::Enum,
2434        6 => SymbolKind::TypeAlias,
2435        7 => SymbolKind::Variable,
2436        8 => SymbolKind::Heading,
2437        9 => SymbolKind::FileSummary,
2438        _ => SymbolKind::Heading,
2439    }
2440}
2441
/// Reads a little-endian `u32` from `data` at `*pos` and advances `*pos` by 4.
///
/// # Errors
///
/// Returns an error, leaving `*pos` untouched, when fewer than four bytes
/// remain at `*pos`. The bound is computed with checked arithmetic so a
/// cursor near `usize::MAX` yields an error instead of an overflow panic.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    let bytes = (*pos)
        .checked_add(4)
        // `get` rejects ranges that run past the end of `data`.
        .and_then(|end| data.get(*pos..end))
        .and_then(|slice| <[u8; 4]>::try_from(slice).ok())
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;
    *pos += 4;
    Ok(u32::from_le_bytes(bytes))
}
2450
/// Reads a little-endian `u64` from `data` at `*pos` and advances `*pos` by 8.
///
/// # Errors
///
/// Returns an error, leaving `*pos` untouched, when fewer than eight bytes
/// remain at `*pos`. The bound is computed with checked arithmetic so a
/// cursor near `usize::MAX` yields an error instead of an overflow panic.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    let bytes = (*pos)
        .checked_add(8)
        // `get` rejects ranges that run past the end of `data`.
        .and_then(|end| data.get(*pos..end))
        .and_then(|slice| <[u8; 8]>::try_from(slice).ok())
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;
    *pos += 8;
    Ok(u64::from_le_bytes(bytes))
}
2459
2460fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
2461    let len = read_u32(data, pos)? as usize;
2462    if *pos + len > data.len() {
2463        return Err("unexpected end of data reading string".to_string());
2464    }
2465    let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
2466    *pos += len;
2467    Ok(s)
2468}
2469
2470#[cfg(test)]
2471mod tests {
2472    use super::*;
2473    use crate::config::{SemanticBackend, SemanticBackendConfig};
2474    use crate::parser::FileParser;
2475    use std::io::{Read, Write};
2476    use std::net::TcpListener;
2477    use std::thread;
2478
/// Spawns a one-shot mock HTTP server on an ephemeral loopback port.
///
/// The server accepts a single connection, reads the request headers plus
/// `Content-Length` bytes of body, and calls
/// `handler(request_line, path, body)`. The string the handler returns is
/// sent back as the body of a `200 OK` JSON response.
///
/// Returns the base URL (`http://<addr>`) and the join handle of the serving
/// thread; joining it surfaces any panic raised inside the handler.
fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
where
    F: Fn(String, String, String) -> String + Send + 'static,
{
    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept request");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end = None;
        let mut content_length = 0usize;
        loop {
            let n = stream.read(&mut chunk).expect("read request");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    let headers = String::from_utf8_lossy(&buf[..pos + 4]);
                    for line in headers.lines() {
                        // HTTP field names are case-insensitive, so a client
                        // sending `content-length:` must be honored too;
                        // otherwise the body would be silently dropped.
                        if let Some((name, value)) = line.split_once(':') {
                            if name.trim().eq_ignore_ascii_case("content-length") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
            }
            // Stop once the headers and the announced body have fully arrived.
            if let Some(end) = header_end {
                if buf.len() >= end + content_length {
                    break;
                }
            }
        }

        let end = header_end.expect("header terminator");
        let request = String::from_utf8_lossy(&buf[..end]).to_string();
        let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
        let mut lines = request.lines();
        let request_line = lines.next().expect("request line").to_string();
        // Second whitespace-separated token of the request line is the path.
        let path = request_line
            .split_whitespace()
            .nth(1)
            .expect("request path")
            .to_string();
        let response_body = handler(request_line, path, body);
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            response_body.len(),
            response_body
        );
        stream
            .write_all(response.as_bytes())
            .expect("write response");
    });

    (format!("http://{}", addr), handle)
}
2538
/// Stub embedder: yields the fixed vector `[1.0, 0.0, 0.0]` once per input
/// text and never fails.
fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
    let unit_x = vec![1.0, 0.0, 0.0];
    Ok(vec![unit_x; texts.len()])
}
2542
/// Writes a three-line Rust source file to `path` containing a single public
/// function named `function_name` that returns `true`.
///
/// Panics if the file cannot be written.
fn write_rust_file(path: &Path, function_name: &str) {
    let source = format!("pub fn {function_name}() -> bool {{\n    true\n}}\n");
    fs::write(path, source).unwrap();
}
2550
2551    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
2552        let mut embed = test_vector_for_texts;
2553        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
2554    }
2555
/// Project root used by tests that never touch the filesystem: the current
/// working directory of the test process. Panics if it cannot be read.
fn test_project_root() -> PathBuf {
    let cwd = std::env::current_dir();
    cwd.unwrap()
}
2559
2560    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
2561        index.file_mtimes.insert(file.to_path_buf(), mtime);
2562        index.file_sizes.insert(file.to_path_buf(), size);
2563        index
2564            .file_hashes
2565            .insert(file.to_path_buf(), cache_freshness::zero_hash());
2566    }
2567
/// An entry and file metadata recorded for `<root>/../outside.rs` must be
/// absent after a `to_bytes` / `from_bytes` round-trip.
#[test]
fn semantic_cache_serialization_skips_paths_outside_project_root() {
    let temp = tempfile::tempdir().expect("create temp dir");
    let root = fs::canonicalize(temp.path()).expect("canonical project");
    // Joins a `..` component so the path leaves the project directory.
    let escaped = root.join("..").join("outside.rs");

    let mut index = SemanticIndex::new(root.clone(), 3);
    index
        .file_mtimes
        .insert(escaped.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(escaped.clone(), 1);
    index
        .file_hashes
        .insert(escaped.clone(), cache_freshness::zero_hash());

    let chunk = SemanticChunk {
        file: escaped,
        name: "outside".to_string(),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 0,
        exported: false,
        embed_text: "outside".to_string(),
        snippet: "outside".to_string(),
    };
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![1.0, 0.0, 0.0],
    });

    let serialized = index.to_bytes();
    let reloaded = SemanticIndex::from_bytes(&serialized, &root).expect("load serialized index");

    assert_eq!(reloaded.entries.len(), 0);
    assert!(reloaded.file_mtimes.is_empty());
}
2600
/// Identical vectors have a similarity of 1 (within tolerance).
#[test]
fn test_cosine_similarity_identical() {
    let left = [1.0, 0.0, 0.0];
    let right = [1.0, 0.0, 0.0];
    let similarity = cosine_similarity(&left, &right);
    assert!((similarity - 1.0).abs() < 0.001);
}
2607
/// Orthogonal vectors have a similarity of 0 (within tolerance).
#[test]
fn test_cosine_similarity_orthogonal() {
    let x_axis = [1.0, 0.0, 0.0];
    let y_axis = [0.0, 1.0, 0.0];
    let similarity = cosine_similarity(&x_axis, &y_axis);
    assert!(similarity.abs() < 0.001);
}
2614
/// Opposite vectors have a similarity of -1 (within tolerance).
#[test]
fn test_cosine_similarity_opposite() {
    let forward = [1.0, 0.0, 0.0];
    let backward = [-1.0, 0.0, 0.0];
    let similarity = cosine_similarity(&forward, &backward);
    assert!((similarity + 1.0).abs() < 0.001);
}
2621
/// A populated index must come back from `to_bytes` / `from_bytes` with its
/// entry name, vector, dimension, and backend/model labels intact.
#[test]
fn test_serialization_roundtrip() {
    let root = test_project_root();
    let source_file = root.join("src/main.rs");

    let chunk = SemanticChunk {
        file: source_file.clone(),
        name: "handle_request".to_string(),
        kind: SymbolKind::Function,
        start_line: 10,
        end_line: 25,
        exported: true,
        embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
        snippet: "fn handle_request() {\n  // ...\n}".to_string(),
    };

    let mut original = SemanticIndex::new(root.clone(), DEFAULT_DIMENSION);
    original.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3, 0.4],
    });
    // The index was created with the default dimension; narrow it to match
    // the four-component vector pushed above.
    original.dimension = 4;
    original
        .file_mtimes
        .insert(source_file.clone(), SystemTime::UNIX_EPOCH);
    original.file_sizes.insert(source_file, 0);
    original.set_fingerprint(SemanticIndexFingerprint {
        backend: "fastembed".to_string(),
        model: "all-MiniLM-L6-v2".to_string(),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 4,
        chunking_version: default_chunking_version(),
    });

    let encoded = original.to_bytes();
    let decoded = SemanticIndex::from_bytes(&encoded, &root).unwrap();

    let entries = &decoded.entries;
    assert_eq!(entries.len(), 1);
    assert_eq!(entries[0].chunk.name, "handle_request");
    assert_eq!(entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
    assert_eq!(decoded.dimension, 4);
    assert_eq!(decoded.backend_label(), Some("fastembed"));
    assert_eq!(decoded.model_label(), Some("all-MiniLM-L6-v2"));
}
2663
/// Every `SymbolKind` variant, including `FileSummary`, encodes to its
/// expected tag (`0..=9`, in the order listed) and decodes back to itself.
#[test]
fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
    // Position in this table is the expected tag.
    let kinds_by_tag = [
        SymbolKind::Function,
        SymbolKind::Class,
        SymbolKind::Method,
        SymbolKind::Struct,
        SymbolKind::Interface,
        SymbolKind::Enum,
        SymbolKind::TypeAlias,
        SymbolKind::Variable,
        SymbolKind::Heading,
        SymbolKind::FileSummary,
    ];

    for (tag, kind) in (0u8..).zip(&kinds_by_tag) {
        assert_eq!(symbol_kind_to_u8(kind), tag);
        assert_eq!(&u8_to_symbol_kind(tag), kind);
    }
}
2684
/// With three mutually orthogonal entries, a query leaning toward the first
/// axis must return exactly `k = 2` results, ranked with "auth" first.
#[test]
fn test_search_top_k() {
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.dimension = 3;

    // Entry `i` gets the unit vector along axis `i`.
    let names = ["auth", "database", "handler"];
    for (i, name) in names.iter().copied().enumerate() {
        let vector: Vec<f32> = (0..3)
            .map(|axis| if axis == i { 1.0 } else { 0.0 })
            .collect();
        let chunk = SemanticChunk {
            file: PathBuf::from("/src/lib.rs"),
            name: name.to_string(),
            kind: SymbolKind::Function,
            start_line: (i * 10 + 1) as u32,
            end_line: (i * 10 + 5) as u32,
            exported: true,
            embed_text: format!("kind:function name:{}", name),
            snippet: format!("fn {}() {{}}", name),
        };
        index.entries.push(EmbeddingEntry { chunk, vector });
    }

    // Mostly aligned with axis 0, i.e. with "auth".
    let query = [0.9, 0.1, 0.0];
    let hits = index.search(&query, 2);

    assert_eq!(hits.len(), 2);
    assert_eq!(hits[0].name, "auth");
    assert!(hits[0].score > hits[1].score);
}
2717
/// Searching an index with no entries yields no results.
#[test]
fn test_empty_index_search() {
    let empty_index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let query = [0.1, 0.2, 0.3];
    let hits = empty_index.search(&query, 10);
    assert!(hits.is_empty());
}
2724
/// A symbol whose range starts and ends on line 0 must produce that source
/// line (without the trailing newline) as its snippet.
#[test]
fn single_line_symbol_builds_non_empty_snippet() {
    let source = "export const answer = 42;\n";
    let one_line_range = crate::symbols::Range {
        start_line: 0,
        start_col: 0,
        end_line: 0,
        end_col: 24,
    };
    let symbol = Symbol {
        name: "answer".to_string(),
        kind: SymbolKind::Variable,
        range: one_line_range,
        signature: Some("const answer = 42".to_string()),
        scope_chain: Vec::new(),
        exported: true,
        parent: None,
    };

    assert_eq!(build_snippet(&symbol, source), "export const answer = 42;");
}
2747
/// Chunks from `collect_file_chunks` must fingerprint identically to chunks
/// built from `FileParser::extract_symbols` + `symbols_to_chunks`, using this
/// crate's own `src/semantic_index.rs` as the input file.
#[test]
fn optimized_file_chunk_collection_matches_file_parser_path() {
    let root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let target = root.join("src/semantic_index.rs");
    let text = std::fs::read_to_string(&target).unwrap();

    // Reference path: FileParser symbols converted to chunks.
    let mut file_parser = FileParser::new();
    let reference_symbols = file_parser.extract_symbols(&target).unwrap();
    let expected = symbols_to_chunks(&target, &reference_symbols, &text, &root);

    // Path under test: parser cache keyed map + collect_file_chunks.
    let mut parser_cache = HashMap::new();
    let actual = collect_file_chunks(&root, &target, &mut parser_cache).unwrap();

    assert_eq!(chunk_fingerprint(&actual), chunk_fingerprint(&expected));
}
2766
2767    fn chunk_fingerprint(
2768        chunks: &[SemanticChunk],
2769    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
2770        chunks
2771            .iter()
2772            .map(|chunk| {
2773                (
2774                    chunk.name.clone(),
2775                    chunk.kind.clone(),
2776                    chunk.start_line,
2777                    chunk.end_line,
2778                    chunk.exported,
2779                    chunk.embed_text.clone(),
2780                    chunk.snippet.clone(),
2781                )
2782            })
2783            .collect()
2784    }
2785
/// `from_bytes` must reject a buffer announcing a dimension of
/// `MAX_DIMENSION + 1`.
#[test]
fn rejects_oversized_dimension_during_deserialization() {
    let oversized_dimension = (MAX_DIMENSION as u32) + 1;

    // Version byte, then three little-endian u32 fields; the first one
    // carries the dimension under test.
    let mut payload = vec![1u8];
    payload.extend_from_slice(&oversized_dimension.to_le_bytes());
    payload.extend_from_slice(&0u32.to_le_bytes());
    payload.extend_from_slice(&0u32.to_le_bytes());

    let outcome = SemanticIndex::from_bytes(&payload, &test_project_root());
    assert!(outcome.is_err());
}
2796
/// `from_bytes` must reject a buffer whose second u32 field is
/// `MAX_ENTRIES + 1` even though the dimension field is valid.
#[test]
fn rejects_oversized_entry_count_during_deserialization() {
    let valid_dimension = DEFAULT_DIMENSION as u32;
    let oversized_count = (MAX_ENTRIES as u32) + 1;

    // Version byte, then three little-endian u32 fields.
    let mut payload = vec![1u8];
    payload.extend_from_slice(&valid_dimension.to_le_bytes());
    payload.extend_from_slice(&oversized_count.to_le_bytes());
    payload.extend_from_slice(&0u32.to_le_bytes());

    let outcome = SemanticIndex::from_bytes(&payload, &test_project_root());
    assert!(outcome.is_err());
}
2807
/// `invalidate_file` must drop the file's entry together with its recorded
/// mtime and size.
#[test]
fn invalidate_file_removes_entries_and_mtime() {
    let path = PathBuf::from("/src/main.rs");
    let chunk = SemanticChunk {
        file: path.clone(),
        name: "main".to_string(),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 1,
        exported: false,
        embed_text: "main".to_string(),
        snippet: "fn main() {}".to_string(),
    };

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![1.0; DEFAULT_DIMENSION],
    });
    index
        .file_mtimes
        .insert(path.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(path.clone(), 0);

    index.invalidate_file(&path);

    assert!(index.entries.is_empty());
    assert!(!index.file_mtimes.contains_key(&path));
    assert!(!index.file_sizes.contains_key(&path));
}
2836
/// When an indexed file looks stale but has been removed from disk while
/// still being listed for refresh, the refresh must report no changes and
/// leave the existing entries and recorded metadata untouched.
#[test]
fn refresh_transient_error_preserves_existing_entry_and_mtime() {
    let temp = tempfile::tempdir().unwrap();
    let root = temp.path();
    let source_file = root.join("src/lib.rs");
    fs::create_dir_all(source_file.parent().unwrap()).unwrap();
    write_rust_file(&source_file, "kept_symbol");

    let mut index = build_test_index(root, std::slice::from_ref(&source_file));
    let entries_before = index.entries.len();
    let mtime_before = index.file_mtimes.get(&source_file).copied().unwrap();
    let size_before = index.file_sizes.get(&source_file).copied().unwrap();

    // Make the recorded metadata disagree with disk, then delete the file.
    let stale_mtime = SystemTime::UNIX_EPOCH;
    let stale_size = size_before + 1;
    set_file_metadata(&mut index, &source_file, stale_mtime, stale_size);
    fs::remove_file(&source_file).unwrap();

    let mut embed = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            root,
            std::slice::from_ref(&source_file),
            &mut embed,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);

    assert_eq!(index.entries.len(), entries_before);
    let kept = index
        .entries
        .iter()
        .any(|entry| entry.chunk.name == "kept_symbol");
    assert!(kept);

    // Metadata still holds the stale values written above.
    assert_eq!(index.file_mtimes.get(&source_file), Some(&stale_mtime));
    assert_ne!(index.file_mtimes.get(&source_file), Some(&mtime_before));
    assert_eq!(index.file_sizes.get(&source_file), Some(&stale_size));
}
2878
/// Refreshing with a path that does not exist on disk and was never indexed
/// must report no changes and record no metadata or entries for it.
#[test]
fn refresh_never_indexed_file_error_does_not_record_mtime() {
    let temp = tempfile::tempdir().unwrap();
    let root = temp.path();
    // Only the parent directory is created; the file itself never exists.
    let absent = root.join("src/missing.rs");
    fs::create_dir_all(absent.parent().unwrap()).unwrap();

    // NOTE(review): the index is rooted at the current directory while the
    // refresh below is given the temp root — confirm this is intended.
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let mut embed = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            root,
            std::slice::from_ref(&absent),
            &mut embed,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert_eq!(summary.added, 0);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.deleted, 0);

    assert!(!index.file_mtimes.contains_key(&absent));
    assert!(!index.file_sizes.contains_key(&absent));
    assert!(index.entries.is_empty());
}
2906
/// A file that exists on disk but was absent from the original build must be
/// counted as `added` and end up indexed after the refresh.
#[test]
fn refresh_reports_added_for_new_files() {
    let temp = tempfile::tempdir().unwrap();
    let root = temp.path();
    let indexed_file = root.join("src/lib.rs");
    let new_file = root.join("src/new.rs");
    fs::create_dir_all(indexed_file.parent().unwrap()).unwrap();
    write_rust_file(&indexed_file, "existing_symbol");
    write_rust_file(&new_file, "added_symbol");

    // Only the first file is part of the initial index.
    let mut index = build_test_index(root, std::slice::from_ref(&indexed_file));

    let candidates = [indexed_file.clone(), new_file.clone()];
    let mut embed = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(root, &candidates, &mut embed, 8, &mut on_progress)
        .unwrap();

    assert_eq!(summary.added, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 2);

    assert!(index.file_mtimes.contains_key(&new_file));
    let new_file_indexed = index
        .entries
        .iter()
        .any(|entry| entry.chunk.file == new_file);
    assert!(new_file_indexed);
}
2937
/// An indexed file that is removed from disk and no longer listed for
/// refresh must be counted as `deleted` and purged from the index.
#[test]
fn refresh_reports_deleted_for_removed_files() {
    let temp = tempfile::tempdir().unwrap();
    let root = temp.path();
    let doomed = root.join("src/deleted.rs");
    fs::create_dir_all(doomed.parent().unwrap()).unwrap();
    write_rust_file(&doomed, "deleted_symbol");

    let mut index = build_test_index(root, std::slice::from_ref(&doomed));
    fs::remove_file(&doomed).unwrap();

    // Refresh with an empty file list.
    let no_files: &[PathBuf] = &[];
    let mut embed = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(root, no_files, &mut embed, 8, &mut on_progress)
        .unwrap();

    assert_eq!(summary.deleted, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.total_processed, 1);

    assert!(!index.file_mtimes.contains_key(&doomed));
    assert!(index.entries.is_empty());
}
2962
/// A file rewritten after indexing, whose recorded metadata no longer
/// matches, must be counted as `changed` and have its old entries replaced
/// by the new symbol.
#[test]
fn refresh_reports_changed_for_modified_files() {
    let temp = tempfile::tempdir().unwrap();
    let root = temp.path();
    let source_file = root.join("src/lib.rs");
    fs::create_dir_all(source_file.parent().unwrap()).unwrap();
    write_rust_file(&source_file, "old_symbol");

    let mut index = build_test_index(root, std::slice::from_ref(&source_file));
    // Reset the recorded metadata, then rewrite the file with a new symbol.
    set_file_metadata(&mut index, &source_file, SystemTime::UNIX_EPOCH, 0);
    write_rust_file(&source_file, "new_symbol");

    let mut embed = test_vector_for_texts;
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            root,
            std::slice::from_ref(&source_file),
            &mut embed,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert_eq!(summary.changed, 1);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 1);

    let has_symbol = |wanted: &str| {
        index
            .entries
            .iter()
            .any(|entry| entry.chunk.name == wanted)
    };
    assert!(has_symbol("new_symbol"));
    assert!(!has_symbol("old_symbol"));
}
3000
/// Refreshing an index whose only file is unchanged must be a no-op: the
/// summary reports nothing to do, the embedder is never called, and the
/// entry count stays the same.
#[test]
fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
    let temp = tempfile::tempdir().unwrap();
    let root = temp.path();
    let source_file = root.join("src/lib.rs");
    fs::create_dir_all(source_file.parent().unwrap()).unwrap();
    write_rust_file(&source_file, "clean_symbol");

    let mut index = build_test_index(root, std::slice::from_ref(&source_file));
    let entries_before = index.entries.len();

    // Wrap the stub embedder so any invocation is recorded.
    let mut embedder_invoked = false;
    let mut spying_embed = |texts: Vec<String>| {
        embedder_invoked = true;
        test_vector_for_texts(texts)
    };
    let mut on_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            root,
            std::slice::from_ref(&source_file),
            &mut spying_embed,
            8,
            &mut on_progress,
        )
        .unwrap();

    assert!(summary.is_noop());
    assert_eq!(summary.total_processed, 1);
    assert!(!embedder_invoked);
    assert_eq!(index.entries.len(), entries_before);
}
3032
/// A dlopen failure message naming `libonnxruntime.dylib` must be classified
/// as "ONNX Runtime unavailable".
#[test]
fn detects_missing_onnx_runtime_from_dynamic_load_error() {
    let dlopen_failure = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
    let unavailable = is_onnx_runtime_unavailable(dlopen_failure);
    assert!(unavailable);
}
3039
/// A missing-runtime error must be formatted with the install hint first and
/// must still include an "Original error:" section.
#[test]
fn formats_missing_onnx_runtime_with_install_hint() {
    let dlopen_failure =
        "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file";
    let formatted = format_embedding_init_error(dlopen_failure);

    assert!(formatted.starts_with("ONNX Runtime not found. Install via:"));
    assert!(formatted.contains("Original error:"));
}
3049
/// The OpenAI-compatible backend must POST to `/v1/embeddings` and return
/// the vectors found in the response's `data[].embedding` fields.
#[test]
fn openai_compatible_backend_embeds_with_mock_server() {
    let canned_response =
        r#"{"data":[{"embedding":[0.1,0.2,0.3],"index":0},{"embedding":[0.4,0.5,0.6],"index":1}]}"#;
    let (server_url, server) = start_mock_http_server(move |request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/v1/embeddings");
        canned_response.to_string()
    });

    let backend_config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "test-embedding".to_string(),
        base_url: Some(server_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
    };

    let mut embedder = SemanticEmbeddingModel::from_config(&backend_config).unwrap();
    let inputs = vec!["hello".to_string(), "world".to_string()];
    let vectors = embedder.embed(inputs).unwrap();

    assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
    // Joining propagates any assertion failure from the handler thread.
    server.join().unwrap();
}
3075
/// Regression for issue #36: AFT was sending TWO Content-Type headers
/// on the OpenAI embeddings request — once implicitly via `.json(&body)`
/// and again explicitly via `.header("Content-Type", "application/json")`.
/// reqwest's `.header()` calls `HeaderMap::append`, which produces two
/// headers on the wire. OpenAI's /v1/embeddings endpoint rejects that
/// with `HTTP 400 "you must provide a model parameter"` even though the
/// body actually contains `model`. The fix is to drop the explicit
/// `.header("Content-Type", ...)` call. This test pins that we send
/// exactly one Content-Type header.
///
/// The test runs its own raw TCP server so it can capture the request bytes
/// exactly as they were sent.
#[test]
fn openai_compatible_request_has_single_content_type_header() {
    use std::sync::{Arc, Mutex};
    let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
    let captured_for_thread = Arc::clone(&captured);

    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end = None;
        let mut content_length = 0usize;
        loop {
            let n = stream.read(&mut chunk).expect("read");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
                        // HTTP field names are case-insensitive. Matching only
                        // `Content-Length:` would leave the length at 0 for a
                        // lowercase header and stop reading before the body
                        // is guaranteed to have arrived, yet the body is
                        // asserted on below.
                        if let Some((name, value)) = line.split_once(':') {
                            if name.trim().eq_ignore_ascii_case("content-length") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
            }
            // Stop once the headers and the announced body have fully arrived.
            if let Some(end) = header_end {
                if buf.len() >= end + content_length {
                    break;
                }
            }
        }
        *captured_for_thread.lock().unwrap() = buf;
        let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            body.len(),
            body
        );
        let _ = stream.write_all(response.as_bytes());
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "text-embedding-3-small".to_string(),
        base_url: Some(format!("http://{}", addr)),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let _ = model.embed(vec!["probe".to_string()]).unwrap();
    handle.join().unwrap();

    let bytes = captured.lock().unwrap().clone();
    let request = String::from_utf8_lossy(&bytes);

    // Compare lowercased lines because HTTP header names are
    // case-insensitive and the client may emit `content-type` in lowercase.
    let content_type_lines = request
        .lines()
        .filter(|line| {
            let lower = line.to_ascii_lowercase();
            lower.starts_with("content-type:")
        })
        .count();
    assert_eq!(
        content_type_lines, 1,
        "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
    );

    // The body must still include the model field — pin this so a future
    // change can't accidentally drop `model` while fixing duplicate headers.
    assert!(
        request.contains(r#""model":"text-embedding-3-small""#),
        "request body should contain model field; full request:\n{request}",
    );
}
3167
/// Drives the Ollama backend against a local mock HTTP server and checks
/// that the canned `embeddings` payload comes back as two vectors.
#[test]
fn ollama_backend_embeds_with_mock_server() {
    // The mock verifies the request shape (POST to /api/embed) before
    // answering with a fixed two-vector response.
    let (mock_url, server) = start_mock_http_server(|request_line, path, _body| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/api/embed");
        String::from("{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}")
    });

    let backend_config = SemanticBackendConfig {
        backend: SemanticBackend::Ollama,
        model: String::from("embeddinggemma"),
        base_url: Some(mock_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
    };

    let inputs = vec![String::from("hello"), String::from("world")];
    let mut embedder = SemanticEmbeddingModel::from_config(&backend_config).unwrap();
    let embedded = embedder.embed(inputs).unwrap();

    assert_eq!(embedded, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);

    // Joining surfaces any assertion failure raised inside the mock thread.
    server.join().unwrap();
}
3193
/// Writes a one-entry index with a known fingerprint, then checks that
/// `read_from_disk` returns it for the matching fingerprint string and
/// returns `None` for a different one.
#[test]
fn read_from_disk_rejects_fingerprint_mismatch() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj";
    let project_root = test_project_root();
    let source_file = project_root.join("src/main.rs");

    // Single chunk with a 3-component vector; `dimension` is set to match.
    let chunk = SemanticChunk {
        file: source_file.clone(),
        name: String::from("handle_request"),
        kind: SymbolKind::Function,
        start_line: 10,
        end_line: 25,
        exported: true,
        embed_text: String::from("file:src/main.rs kind:function name:handle_request"),
        snippet: String::from("fn handle_request() {}"),
    };

    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_file, 0);

    let stored_fingerprint = SemanticIndexFingerprint {
        backend: String::from("openai_compatible"),
        model: String::from("test-embedding"),
        base_url: String::from("http://127.0.0.1:1234/v1"),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(stored_fingerprint);
    index.write_to_disk(storage.path(), project_key);

    // Same fingerprint as the one persisted: the cache must load.
    let matching = index.fingerprint().unwrap().as_string();
    let accepted = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&matching),
    );
    assert!(accepted.is_some());

    // Different backend/model/base_url: the cache must be refused.
    let other_backend = SemanticIndexFingerprint {
        backend: String::from("ollama"),
        model: String::from("embeddinggemma"),
        base_url: String::from("http://127.0.0.1:11434"),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    let mismatched = other_backend.as_string();
    let refused = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &project_root,
        false,
        Some(&mismatched),
    );
    assert!(refused.is_none());
}
3256
/// A cache file whose first byte is the V3 version constant must not be
/// loaded, and the file must be gone from disk after the read attempt.
#[test]
fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
    let storage = tempfile::tempdir().unwrap();
    let project_key = "proj-v3";
    let cache_dir = storage.path().join("semantic").join(project_key);
    fs::create_dir_all(&cache_dir).unwrap();
    let cache_file = cache_dir.join("semantic.bin");

    let source_file = PathBuf::from("/src/main.rs");
    let chunk = SemanticChunk {
        file: source_file.clone(),
        name: String::from("handle_request"),
        kind: SymbolKind::Function,
        start_line: 0,
        end_line: 0,
        exported: true,
        embed_text: String::from("file:src/main.rs kind:function name:handle_request"),
        snippet: String::from("fn handle_request() {}"),
    };

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.entries.push(EmbeddingEntry {
        chunk,
        vector: vec![0.1, 0.2, 0.3],
    });
    index.dimension = 3;
    index
        .file_mtimes
        .insert(source_file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(source_file, 0);

    let fingerprint = SemanticIndexFingerprint {
        backend: String::from("fastembed"),
        model: String::from("test"),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 3,
        chunking_version: default_chunking_version(),
    };
    index.set_fingerprint(fingerprint.clone());

    // Serialize with the current writer, then stamp byte 0 with the V3
    // version constant so the file presents itself as a V3 cache.
    let mut bytes = index.to_bytes();
    bytes[0] = SEMANTIC_INDEX_VERSION_V3;
    fs::write(&cache_file, bytes).unwrap();

    // The fingerprint matches, so a rejection cannot be blamed on it.
    let expected_fingerprint = fingerprint.as_string();
    let loaded = SemanticIndex::read_from_disk(
        storage.path(),
        project_key,
        &test_project_root(),
        false,
        Some(&expected_fingerprint),
    );
    assert!(loaded.is_none());
    assert!(!cache_file.exists());
}
3306
3307    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3308        crate::symbols::Symbol {
3309            name: name.to_string(),
3310            kind,
3311            range: crate::symbols::Range {
3312                start_line: start,
3313                start_col: 0,
3314                end_line: end,
3315                end_col: 0,
3316            },
3317            signature: None,
3318            scope_chain: Vec::new(),
3319            exported: false,
3320            parent: None,
3321        }
3322    }
3323
/// A file whose only symbols are `SymbolKind::Heading` must yield no
/// chunks at all: headings are excluded from the semantic index so that
/// heading prose does not crowd out code chunks in search results.
#[test]
fn symbols_to_chunks_skips_heading_symbols() {
    let root = PathBuf::from("/proj");
    let readme = root.join("README.md");
    let markdown = "# Title\n\nbody text\n\n## Section\n\nmore text\n";

    let mut headings = Vec::new();
    headings.push(make_symbol(SymbolKind::Heading, "Title", 0, 2));
    headings.push(make_symbol(SymbolKind::Heading, "Section", 4, 6));

    let chunks = symbols_to_chunks(&readme, &headings, markdown, &root);

    assert!(
        chunks.is_empty(),
        "Heading symbols must be filtered out before embedding; got {} chunk(s)",
        chunks.len()
    );
}
3346
/// Mixing a heading with code symbols must drop only the heading: the
/// function and struct are still chunked, together with a file-summary
/// chunk, for three chunks in total.
#[test]
fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
    let root = PathBuf::from("/proj");
    let lib_rs = root.join("src/lib.rs");
    let code = "pub fn handle_request() -> bool {\n    true\n}\n";

    // One heading (which must be skipped) followed by two code symbols.
    let mut mixed = Vec::new();
    mixed.push(make_symbol(SymbolKind::Heading, "doc heading", 0, 1));
    mixed.push(make_symbol(SymbolKind::Function, "handle_request", 0, 2));
    mixed.push(make_symbol(SymbolKind::Struct, "AuthService", 4, 6));

    let chunks = symbols_to_chunks(&lib_rs, &mixed, code, &root);
    assert_eq!(
        chunks.len(),
        3,
        "Expected file-summary + 2 code chunks (Function + Struct), got {}",
        chunks.len()
    );

    let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();

    let has_file_summary = chunks
        .iter()
        .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary));
    assert!(has_file_summary);

    assert!(names.contains(&"handle_request"));
    assert!(names.contains(&"AuthService"));
    assert!(
        !names.contains(&"doc heading"),
        "Heading symbol leaked into chunks: {names:?}"
    );
}
3381
/// Loopback hostnames must pass the SSRF check so that self-hosted
/// backends reachable through `localhost` (for example Ollama at
/// http://localhost:11434) work with their default configuration.
#[test]
fn validate_ssrf_allows_loopback_hostnames() {
    for host in &[
        "http://localhost",
        "http://localhost:8080",
        "http://localhost:11434", // Ollama default
        "http://localhost.localdomain",
        "http://foo.localhost",
    ] {
        // Validate once and reuse the outcome for both the check and the
        // failure message, so the diagnostic always shows the exact value
        // that failed (and matches the sibling SSRF tests).
        let result = validate_base_url_no_ssrf(host);
        assert!(
            result.is_ok(),
            "Expected {host} to be allowed (loopback), got: {:?}",
            result
        );
    }
}
3400
/// Literal addresses in 127.0.0.0/8 are loopback and must pass the SSRF
/// check, so a backend such as Ollama at http://127.0.0.1:11434 is usable.
#[test]
fn validate_ssrf_allows_loopback_ips() {
    let loopback_urls = [
        "http://127.0.0.1",
        "http://127.0.0.1:11434", // Ollama default
        "http://127.0.0.1:8080",
        "http://127.1.2.3",
    ];

    for url in loopback_urls.iter() {
        let result = validate_base_url_no_ssrf(url);
        assert!(
            result.is_ok(),
            "Expected {url} to be allowed (loopback), got: {:?}",
            result
        );
    }
}
3419
/// Private or otherwise reserved addresses that are not loopback must be
/// refused by the SSRF check, even though the user supplied them: LAN and
/// link-local services remain SSRF targets.
#[test]
fn validate_ssrf_rejects_private_non_loopback_ips() {
    let non_loopback_urls = [
        "http://192.168.1.1",
        "http://10.0.0.1",
        "http://172.16.0.1",
        "http://169.254.169.254",
        "http://100.64.0.1",
    ];

    for url in non_loopback_urls.iter() {
        let result = validate_base_url_no_ssrf(url);
        assert!(
            result.is_err(),
            "Expected {url} to be rejected (non-loopback private), got: {:?}",
            result
        );
    }
}
3441
/// Hostnames ending in `.local` (mDNS) must be refused by the SSRF check;
/// such names typically point at LAN devices rather than this machine.
#[test]
fn validate_ssrf_rejects_mdns_local_hostnames() {
    let mdns_urls = [
        "http://printer.local",
        "http://nas.local:8080",
        "http://homelab.local",
    ];

    for host in mdns_urls.iter() {
        let result = validate_base_url_no_ssrf(host);
        assert!(
            result.is_err(),
            "Expected {host} to be rejected (mDNS), got: {:?}",
            result
        );
    }
}
3459
/// `normalize_base_url` must accept loopback URLs, both by IP and by
/// hostname; tests in this module point backends at local mock servers.
#[test]
fn normalize_base_url_allows_localhost_for_tests() {
    let by_ip = normalize_base_url("http://127.0.0.1:9999");
    assert!(by_ip.is_ok());

    let by_hostname = normalize_base_url("http://localhost:8080");
    assert!(by_hostname.is_ok());
}
3467
/// Pins the user-facing wording of the ONNX Runtime version-mismatch
/// message: it must echo the detected version and library path, state the
/// `v1.20+` requirement, list the "Auto-fix" option ahead of the
/// "Remove the old library" option, and contain the exact
/// `npx @cortexkit/aft doctor --fix` command.
#[test]
fn ort_mismatch_message_recommends_auto_fix_first() {
    let detected_version = "1.9.0";
    let system_library = "/usr/lib/x86_64-linux-gnu/libonnxruntime.so";
    let msg = format_ort_version_mismatch(detected_version, system_library);

    // Detected version, library path, and requirement are reported verbatim.
    assert!(
        msg.contains("v1.9.0"),
        "should report detected version: {msg}"
    );
    assert!(
        msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
        "should report system path: {msg}"
    );
    assert!(msg.contains("v1.20+"), "should state requirement: {msg}");

    // Ordering: only the relative position of the auto-fix and manual
    // removal options is checked here.
    let auto_fix_pos = msg
        .find("Auto-fix")
        .expect("Auto-fix solution missing — users won't discover --fix");
    let remove_pos = msg
        .find("Remove the old library")
        .expect("system-rm solution missing");
    let auto_fix_listed_first = auto_fix_pos < remove_pos;
    assert!(
        auto_fix_listed_first,
        "Auto-fix must come before manual rm — see PR comment thread"
    );

    // The command has to be present exactly as a user would paste it.
    let has_fix_command = msg.contains("npx @cortexkit/aft doctor --fix");
    assert!(
        has_fix_command,
        "auto-fix command must be present and copy-pasteable: {msg}"
    );
}
3508
/// Regression guard for macOS-style `.dylib` paths: the mismatch message
/// must contain the detected version, the library path, and the path
/// wrapped in single quotes.
#[test]
fn ort_mismatch_message_handles_macos_dylib_path() {
    let dylib_path = "/opt/homebrew/lib/libonnxruntime.dylib";
    let msg = format_ort_version_mismatch("1.9.0", dylib_path);

    assert!(msg.contains("v1.9.0"));
    assert!(msg.contains(dylib_path));

    // Only the single-quoted occurrence is pinned here; per the assertion
    // text it is expected to sit in the auto-fix sentence.
    let quoted_path_present = msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'");
    assert!(
        quoted_path_present,
        "system path should be quoted in the auto-fix sentence: {msg}"
    );
}
3525}
aft/semantic_index.rs

aft/
semantic_index.rs