aft/
semantic_index.rs

1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
/// Embedding width of 384.
/// NOTE(review): presumably the output size of the default local model; it is
/// not referenced in the visible part of this file — confirm at the use site.
const DEFAULT_DIMENSION: usize = 384;
/// Upper bound on entry count.
/// NOTE(review): presumably enforced when reading/writing the persisted index;
/// the enforcing code is outside the visible part of this file.
const MAX_ENTRIES: usize = 1_000_000;
// Covers high-dimensional backends such as OpenAI text-embedding-3-large (3072)
// and common local models (4096) while keeping a bounded supported shape.
// `validate_embedding_dimension` accepts 1..=MAX_DIMENSION.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of one `f32` component.
const F32_BYTES: usize = std::mem::size_of::<f32>();
/// NOTE(review): presumably the byte length of the V1 on-disk header — confirm
/// against the (de)serialization code.
const HEADER_BYTES_V1: usize = 9;
/// NOTE(review): presumably the byte length of the V2 on-disk header — confirm
/// against the (de)serialization code.
const HEADER_BYTES_V2: usize = 13;
/// User-facing hint explaining how to install ONNX Runtime.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

/// Format version tag 1 of the persisted semantic index.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
/// Format version tag 2 of the persisted semantic index.
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
/// V3 adds subsec_nanos to the file-mtime table so staleness detection survives
/// restart round-trips on filesystems with subsecond mtime precision (APFS,
/// ext4 with nsec, NTFS). V1/V2 persisted whole-second mtimes only, which
/// caused every restart to flag ~99% of files as stale and re-embed them.
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
/// V4 keeps the V3 on-disk layout but rebuilds persisted snippets once after
/// fixing symbol ranges that were incorrectly treated as 1-based.
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
/// V5 adds file sizes to the file metadata table so incremental staleness
/// detection can catch content changes even when mtime precision misses them.
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
/// V6 stores paths relative to project_root and adds content hashes.
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path suffix appended by `build_openai_embeddings_endpoint`.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended by `build_ollama_embeddings_endpoint` when the base URL does
/// not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
// Must stay below the bridge timeout (30s) to avoid bridge kills on slow backends.
// Despite the name, `SemanticEmbeddingModel::from_config` uses this for the one
// HTTP client it builds for every backend whenever `timeout_ms` is 0.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used by `SemanticEmbeddingModel::from_config` when the config
/// leaves `max_batch_size` at 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Maximum number of query embeddings kept by `embed_query_cached` before the
/// oldest inserted entry is evicted.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint's `base_url` when the config has no
/// base URL or it fails normalization.
const FALLBACK_BACKEND: &str = "none";
/// Total number of tries `send_embedding_request` makes for one request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay in milliseconds before each retry, indexed by the zero-based index of
/// the attempt that just failed (two delays for three attempts).
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Process-wide mutex held by `SemanticIndexLock::acquire` while it waits for
/// the on-disk lock file.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
62
63pub struct SemanticIndexLock {
64    _guard: fs_lock::LockGuard,
65}
66
67impl SemanticIndexLock {
68    pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
69        let dir = storage_dir.join("semantic").join(project_key);
70        fs::create_dir_all(&dir)?;
71        let path = dir.join("cache.lock");
72        let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
73            .lock()
74            .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
75        fs_lock::try_acquire(&path, Duration::from_secs(2))
76            .map(|guard| Self { _guard: guard })
77            .map_err(|error| match error {
78                fs_lock::AcquireError::Timeout => {
79                    std::io::Error::other("timed out acquiring semantic cache lock")
80                }
81                fs_lock::AcquireError::Io(error) => error,
82            })
83    }
84}
85
86#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct SemanticIndexFingerprint {
88    pub backend: String,
89    pub model: String,
90    #[serde(default)]
91    pub base_url: String,
92    pub dimension: usize,
93    #[serde(default = "default_chunking_version")]
94    pub chunking_version: u32,
95}
96
97fn default_chunking_version() -> u32 {
98    2
99}
100
101impl SemanticIndexFingerprint {
102    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
103        // Use normalized URL for fingerprinting so cosmetic differences
104        // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds.
105        let base_url = config
106            .base_url
107            .as_ref()
108            .and_then(|u| normalize_base_url(u).ok())
109            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
110        Self {
111            backend: config.backend.as_str().to_string(),
112            model: config.model.clone(),
113            base_url,
114            dimension,
115            chunking_version: default_chunking_version(),
116        }
117    }
118
119    pub fn as_string(&self) -> String {
120        serde_json::to_string(self).unwrap_or_else(|_| String::new())
121    }
122
123    fn matches_expected(&self, expected: &str) -> bool {
124        let encoded = self.as_string();
125        !encoded.is_empty() && encoded == expected
126    }
127}
128
/// Concrete embedding implementation selected by
/// `SemanticEmbeddingModel::from_config` from the configured backend.
enum SemanticEmbeddingEngine {
    /// Local ONNX embedder (all-MiniLM-L6-v2 via raw `ort`). The config-facing
    /// backend string stays "fastembed" for index-fingerprint compatibility.
    Local(LocalEmbedder),
    /// Remote backend reached through `build_openai_embeddings_endpoint`.
    OpenAiCompatible {
        /// Blocking HTTP client carrying the configured timeout, with
        /// redirects disabled.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Base URL after `normalize_base_url`.
        base_url: String,
        /// Bearer token read from the configured environment variable when the
        /// model is constructed; `None` sends no `Authorization` header.
        api_key: Option<String>,
    },
    /// Ollama backend reached through `build_ollama_embeddings_endpoint`.
    Ollama {
        /// Blocking HTTP client carrying the configured timeout, with
        /// redirects disabled.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Base URL after `normalize_base_url`.
        base_url: String,
    },
}
145
/// Embedding model facade: wraps the selected engine together with its
/// effective settings and a small cache of query embeddings.
pub struct SemanticEmbeddingModel {
    /// Backend copied from the config.
    backend: SemanticBackend,
    /// Model name copied from the config.
    model: String,
    /// Base URL exactly as given in the config (not normalized).
    base_url: Option<String>,
    /// Effective request timeout in milliseconds (the default is substituted
    /// when the config value is 0).
    timeout_ms: u64,
    /// Effective maximum batch size (the default is substituted when the
    /// config value is 0).
    max_batch_size: usize,
    /// Cached embedding length; `None` until a probe or a remote embed call
    /// has recorded it.
    dimension: Option<usize>,
    /// Engine that performs the actual embedding.
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding, filled by `embed_query_cached`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of cached queries; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    /// Number of `embed_query_cached` calls answered from the cache.
    query_embedding_cache_hits: u64,
    /// Number of `embed_query_cached` calls that had to call the engine.
    query_embedding_cache_misses: u64,
}

/// Alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
161
162fn validate_embedding_batch(
163    vectors: &[Vec<f32>],
164    expected_count: usize,
165    context: &str,
166) -> Result<(), String> {
167    if expected_count > 0 && vectors.is_empty() {
168        return Err(format!(
169            "{context} returned no vectors for {expected_count} inputs"
170        ));
171    }
172
173    if vectors.len() != expected_count {
174        return Err(format!(
175            "{context} returned {} vectors for {} inputs",
176            vectors.len(),
177            expected_count
178        ));
179    }
180
181    let Some(first_vector) = vectors.first() else {
182        return Ok(());
183    };
184    let expected_dimension = first_vector.len();
185    validate_embedding_dimension(expected_dimension)
186        .map_err(|error| format!("{context} returned {error}"))?;
187    for (index, vector) in vectors.iter().enumerate() {
188        if vector.len() != expected_dimension {
189            return Err(format!(
190                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
191                vector.len()
192            ));
193        }
194    }
195
196    Ok(())
197}
198
199fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
200    if dimension == 0 || dimension > MAX_DIMENSION {
201        return Err(format!(
202            "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
203        ));
204    }
205
206    Ok(())
207}
208
209/// Normalize a base URL: validate scheme and strip trailing slash.
210/// Does NOT perform SSRF/private-IP validation — call
211/// `validate_base_url_no_ssrf` separately when processing user-supplied config.
212fn normalize_base_url(raw: &str) -> Result<String, String> {
213    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
214    let scheme = parsed.scheme();
215    if scheme != "http" && scheme != "https" {
216        return Err(format!(
217            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
218            scheme
219        ));
220    }
221    Ok(parsed.to_string().trim_end_matches('/').to_string())
222}
223
224/// Validate that a base URL does not point to a private/loopback address.
225/// Call this on user-supplied config (at configure time) to prevent SSRF.
226/// Not called for programmatically constructed configs (e.g. tests).
227///
228/// **Loopback is allowed.** Self-hosted embedding backends (e.g. Ollama at
229/// `http://127.0.0.1:11434`) are a primary use case for `aft_search`. Loopback
230/// addresses by definition cannot be exploited as SSRF targets — they only
231/// reach services on the same machine. Allowing loopback unblocks Ollama at its
232/// default config without opening up SSRF to LAN/intranet services, which
233/// remain rejected.
234///
235/// **mDNS `.local` is rejected.** mDNS hostnames typically resolve to LAN
236/// devices (printers, homelab servers); rejecting them before DNS lookup keeps
237/// the SSRF guard meaningful for non-loopback private networks.
238pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
239    use std::net::{IpAddr, ToSocketAddrs};
240
241    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
242
243    let host = parsed.host_str().unwrap_or("");
244
245    // Loopback hostnames are explicitly allowed. RFC 6761 mandates that
246    // `localhost` and `*.localhost` resolve to loopback;
247    // `localhost.localdomain` is a historical alias used on some Linux
248    // distros. Self-hosted backends like Ollama use these by default.
249    let is_loopback_host =
250        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
251    if is_loopback_host {
252        return Ok(());
253    }
254
255    // mDNS hostnames are typically LAN devices, not loopback. Reject before
256    // DNS lookup so users get a clear error rather than a private-IP error.
257    if host.ends_with(".local") {
258        return Err(format!(
259            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
260        ));
261    }
262
263    // Resolve the hostname. Reject private/link-local/CGNAT IPs but NOT
264    // loopback (which is by definition same-machine and not an SSRF target).
265    let port = parsed.port_or_known_default().unwrap_or(443);
266    let addr_str = format!("{host}:{port}");
267    let addrs: Vec<IpAddr> = addr_str
268        .to_socket_addrs()
269        .map(|iter| iter.map(|sa| sa.ip()).collect())
270        .unwrap_or_default();
271    for ip in &addrs {
272        if is_private_non_loopback_ip(ip) {
273            return Err(format!(
274                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
275            ));
276        }
277    }
278
279    Ok(())
280}
281
/// Returns true for IPv4/IPv6 addresses in private/link-local/CGNAT/wildcard
/// ranges, EXCLUDING loopback (127.0.0.0/8 and ::1). Loopback is considered
/// safe for SSRF purposes — see [`validate_base_url_no_ssrf`] for rationale.
///
/// Covered ranges:
/// - IPv4: `10.0.0.0/8`, `172.16.0.0/12`, `192.168.0.0/16`,
///   `169.254.0.0/16` (link-local), `100.64.0.0/10` (CGNAT), `0.0.0.0/8`
///   (wildcard).
/// - IPv6: `::` (wildcard, the counterpart of `0.0.0.0`), `fe80::/10`
///   (link-local), `fc00::/7` (unique-local).
/// - IPv4-mapped IPv6 (`::ffff:a.b.c.d`) is judged by its embedded IPv4
///   address, so `::ffff:127.0.0.1` stays allowed and `::ffff:10.0.0.1` is
///   rejected.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;
    match ip {
        IpAddr::V4(v4) => {
            let [a, b, ..] = v4.octets();
            // Note: 127.0.0.0/8 (loopback) is intentionally NOT in this set.
            // 10.0.0.0/8
            a == 10
            // 172.16.0.0/12
            || (a == 172 && (16..=31).contains(&b))
            // 192.168.0.0/16
            || (a == 192 && b == 168)
            // 169.254.0.0/16 link-local
            || (a == 169 && b == 254)
            // 100.64.0.0/10 CGNAT
            || (a == 100 && (64..=127).contains(&b))
            // 0.0.0.0/8 wildcard
            || a == 0
        }
        IpAddr::V6(v6) => {
            // ::ffff:0:0/96 IPv4-mapped — decide on the embedded IPv4 address.
            if let Some(mapped) = v6.to_ipv4_mapped() {
                return is_private_non_loopback_ip(&IpAddr::V4(mapped));
            }

            // Note: ::1 (loopback) is intentionally NOT in this set.
            let first_segment = v6.segments()[0];
            // :: wildcard, mirroring the IPv4 0.0.0.0 rule
            v6.is_unspecified()
            // fe80::/10 link-local
            || (first_segment & 0xffc0) == 0xfe80
            // fc00::/7 unique-local
            || (first_segment & 0xfe00) == 0xfc00
        }
    }
}
323
324fn build_openai_embeddings_endpoint(base_url: &str) -> String {
325    if base_url.ends_with("/v1") {
326        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
327    } else {
328        format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
329    }
330}
331
332fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
333    if base_url.ends_with("/api") {
334        format!("{base_url}/embed")
335    } else {
336        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
337    }
338}
339
/// Trims surrounding whitespace from an optional string and maps a missing,
/// empty, or whitespace-only value to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let trimmed = value.as_deref().map(str::trim)?;
    (!trimmed.is_empty()).then(|| trimmed.to_owned())
}
350
351fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
352    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
353}
354
/// Detects "model is (un)loading" responses from local backends.
///
/// LM Studio, Ollama and llama.cpp answer with a 4xx — usually 400 — while a
/// model is loading or was just unloaded, e.g. "Model was unloaded while the
/// request was still in queue" or "model is loading". Unlike an ordinary 4xx
/// these are transient (the model will reload), so callers ride them out
/// instead of parking the index in `Failed`. The body is matched
/// case-insensitively (ASCII) against a fixed list of phrases.
fn embedding_response_body_is_transient(raw: &str) -> bool {
    const TRANSIENT_PHRASES: [&str; 6] = [
        "model was unloaded",
        "model is loading",
        "model not loaded",
        "loading model",
        "is currently loading",
        "model is being loaded",
    ];

    let haystack = raw.to_ascii_lowercase();
    TRANSIENT_PHRASES
        .iter()
        .any(|phrase| haystack.contains(*phrase))
}
371
372fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
373    error.is_connect()
374}
375
376/// Whether a send-time error means the backend is *unreachable or temporarily
377/// failing* (vs. a real misconfiguration). Broader than the in-request retry
378/// predicate: a per-request timeout is transient for the build/refresh layer
379/// (the model may still be cold-loading) but we don't burn the 3 fast
380/// in-request attempts on it — the build-level retry rides it out instead.
381fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
382    error.is_connect() || error.is_timeout()
383}
384
/// Machine-readable tag placed in front of embedding error strings whose root
/// cause is transient — the backend is down, timing out, or answering
/// 5xx/429 — as opposed to misconfigured. The build and corpus-refresh layers
/// decide retry-vs-give-up from this tag (via
/// [`embedding_failure_is_transient`]) rather than re-parsing error text, so
/// the one site that knows about transience stays authoritative. Remove it
/// with [`strip_transient_embedding_marker`] before showing a message to users.
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";

/// Reports whether `error` contains the transient marker anywhere in its
/// text, meaning a retry once the backend recovers is appropriate.
pub fn embedding_failure_is_transient(error: &str) -> bool {
    let marker = TRANSIENT_EMBEDDING_MARKER;
    error.contains(marker)
}

/// Returns `error` with every occurrence of the transient marker removed, for
/// display purposes.
pub fn strip_transient_embedding_marker(error: &str) -> String {
    error.split(TRANSIENT_EMBEDDING_MARKER).collect()
}
403
404fn sleep_before_embedding_retry(attempt_index: usize) {
405    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
406        std::thread::sleep(Duration::from_millis(*delay_ms));
407    }
408}
409
410fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
411where
412    F: FnMut() -> reqwest::blocking::RequestBuilder,
413{
414    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
415        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
416
417        let response = match make_request().send() {
418            Ok(response) => response,
419            Err(error) => {
420                if !last_attempt && is_retryable_embedding_error(&error) {
421                    sleep_before_embedding_retry(attempt_index);
422                    continue;
423                }
424                // Connect/timeout failures mean the backend is unreachable or
425                // cold-loading — mark transient so the build layer rides it out
426                // and self-heals instead of parking the index in `Failed`.
427                let marker = if embedding_send_error_is_transient(&error) {
428                    TRANSIENT_EMBEDDING_MARKER
429                } else {
430                    ""
431                };
432                return Err(format!("{marker}{backend_label} request failed: {error}"));
433            }
434        };
435
436        let status = response.status();
437        let raw = match response.text() {
438            Ok(raw) => raw,
439            Err(error) => {
440                if !last_attempt && is_retryable_embedding_error(&error) {
441                    sleep_before_embedding_retry(attempt_index);
442                    continue;
443                }
444                return Err(format!("{backend_label} response read failed: {error}"));
445            }
446        };
447
448        if status.is_success() {
449            return Ok(raw);
450        }
451
452        // A 4xx whose body says the model is loading/unloaded is transient on
453        // local backends (LM Studio/Ollama), so treat it like a retryable
454        // status: ride it out at both the in-request and build-retry layers.
455        let body_transient = embedding_response_body_is_transient(&raw);
456        if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
457            sleep_before_embedding_retry(attempt_index);
458            continue;
459        }
460
461        // 5xx / 429 are server-side and transient — the backend is overloaded
462        // or briefly unavailable, not misconfigured. A 4xx whose body indicates
463        // the model is (un)loading is also transient (local backend mid-swap).
464        // Other 4xx (auth, bad request, model-not-found) is a real error the
465        // user must fix; no marker.
466        let marker = if is_retryable_embedding_status(status) || body_transient {
467            TRANSIENT_EMBEDDING_MARKER
468        } else {
469            ""
470        };
471        return Err(format!(
472            "{marker}{backend_label} request failed (HTTP {}): {}",
473            status, raw
474        ));
475    }
476
477    unreachable!("embedding request retries exhausted without returning")
478}
479
480impl SemanticEmbeddingModel {
481    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
482        let timeout_ms = if config.timeout_ms == 0 {
483            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
484        } else {
485            config.timeout_ms
486        };
487
488        let max_batch_size = if config.max_batch_size == 0 {
489            DEFAULT_MAX_BATCH_SIZE
490        } else {
491            config.max_batch_size
492        };
493
494        let api_key_env = normalize_api_key(config.api_key_env.clone());
495        let model = config.model.clone();
496
497        let client = Client::builder()
498            .timeout(Duration::from_millis(timeout_ms))
499            .redirect(reqwest::redirect::Policy::none())
500            .build()
501            .map_err(|error| format!("failed to configure embedding client: {error}"))?;
502
503        let engine = match config.backend {
504            SemanticBackend::Fastembed => {
505                SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
506            }
507            SemanticBackend::OpenAiCompatible => {
508                let raw = config.base_url.as_ref().ok_or_else(|| {
509                    "base_url is required for openai_compatible backend".to_string()
510                })?;
511                let base_url = normalize_base_url(raw)?;
512
513                let api_key = match api_key_env {
514                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
515                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
516                    })?),
517                    None => None,
518                };
519
520                SemanticEmbeddingEngine::OpenAiCompatible {
521                    client,
522                    model,
523                    base_url,
524                    api_key,
525                }
526            }
527            SemanticBackend::Ollama => {
528                let raw = config
529                    .base_url
530                    .as_ref()
531                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
532                let base_url = normalize_base_url(raw)?;
533
534                SemanticEmbeddingEngine::Ollama {
535                    client,
536                    model,
537                    base_url,
538                }
539            }
540        };
541
542        Ok(Self {
543            backend: config.backend,
544            model: config.model.clone(),
545            base_url: config.base_url.clone(),
546            timeout_ms,
547            max_batch_size,
548            dimension: None,
549            engine,
550            query_embedding_cache: HashMap::new(),
551            query_embedding_cache_order: VecDeque::new(),
552            query_embedding_cache_hits: 0,
553            query_embedding_cache_misses: 0,
554        })
555    }
556
557    pub fn backend(&self) -> SemanticBackend {
558        self.backend
559    }
560
561    pub fn model(&self) -> &str {
562        &self.model
563    }
564
565    pub fn base_url(&self) -> Option<&str> {
566        self.base_url.as_deref()
567    }
568
569    pub fn max_batch_size(&self) -> usize {
570        self.max_batch_size
571    }
572
573    pub fn timeout_ms(&self) -> u64 {
574        self.timeout_ms
575    }
576
577    pub fn fingerprint(
578        &mut self,
579        config: &SemanticBackendConfig,
580    ) -> Result<SemanticIndexFingerprint, String> {
581        let dimension = self.dimension()?;
582        Ok(SemanticIndexFingerprint::from_config(config, dimension))
583    }
584
585    pub fn dimension(&mut self) -> Result<usize, String> {
586        if let Some(dimension) = self.dimension {
587            return Ok(dimension);
588        }
589
590        let dimension = match &mut self.engine {
591            SemanticEmbeddingEngine::Local(model) => {
592                let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
593                vectors
594                    .first()
595                    .map(|v| v.len())
596                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
597            }
598            SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
599                let vectors =
600                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
601                vectors
602                    .first()
603                    .map(|v| v.len())
604                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
605            }
606            SemanticEmbeddingEngine::Ollama { .. } => {
607                let vectors =
608                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
609                vectors
610                    .first()
611                    .map(|v| v.len())
612                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
613            }
614        };
615
616        self.dimension = Some(dimension);
617        Ok(dimension)
618    }
619
620    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
621        self.embed_texts(texts)
622    }
623
624    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
625        if let Some(vector) = self.query_embedding_cache.get(query) {
626            self.query_embedding_cache_hits += 1;
627            return Ok(vector.clone());
628        }
629
630        self.query_embedding_cache_misses += 1;
631        let embeddings = self.embed_texts(vec![query.to_string()])?;
632        let vector = embeddings
633            .first()
634            .cloned()
635            .ok_or_else(|| "embedding model returned no query vector".to_string())?;
636
637        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
638            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
639                self.query_embedding_cache.remove(&oldest);
640            }
641        }
642        self.query_embedding_cache
643            .insert(query.to_string(), vector.clone());
644        self.query_embedding_cache_order
645            .push_back(query.to_string());
646
647        Ok(vector)
648    }
649
650    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
651        (
652            self.query_embedding_cache_hits,
653            self.query_embedding_cache_misses,
654            self.query_embedding_cache.len(),
655        )
656    }
657
658    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
659        match &mut self.engine {
660            SemanticEmbeddingEngine::Local(model) => model
661                .embed(&texts)
662                .map_err(|error| format!("failed to embed batch: {error}")),
663            SemanticEmbeddingEngine::OpenAiCompatible {
664                client,
665                model,
666                base_url,
667                api_key,
668            } => {
669                let expected_text_count = texts.len();
670                let endpoint = build_openai_embeddings_endpoint(base_url);
671                let body = serde_json::json!({
672                    "input": texts,
673                    "model": model,
674                });
675
676                let raw = send_embedding_request(
677                    || {
678                        // `.json(&body)` sets Content-Type: application/json
679                        // automatically. Do NOT add `.header("Content-Type",
680                        // "application/json")` afterwards — RequestBuilder::header()
681                        // calls HeaderMap::append, which produces TWO Content-Type
682                        // headers on the wire. OpenAI's /v1/embeddings endpoint
683                        // treats duplicate Content-Type as malformed and rejects
684                        // the body with 400 "you must provide a model parameter"
685                        // even when `model` is set. Verified end-to-end against
686                        // api.openai.com. See issue #36.
687                        let mut request = client.post(&endpoint).json(&body);
688
689                        if let Some(api_key) = api_key {
690                            request = request.header("Authorization", format!("Bearer {api_key}"));
691                        }
692
693                        request
694                    },
695                    "openai compatible",
696                )?;
697
698                #[derive(Deserialize)]
699                struct OpenAiResponse {
700                    data: Vec<OpenAiEmbeddingResult>,
701                }
702
703                #[derive(Deserialize)]
704                struct OpenAiEmbeddingResult {
705                    embedding: Vec<f32>,
706                    index: Option<u32>,
707                }
708
709                let parsed: OpenAiResponse = serde_json::from_str(&raw)
710                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
711                if parsed.data.len() != expected_text_count {
712                    return Err(format!(
713                        "openai compatible response returned {} embeddings for {} inputs",
714                        parsed.data.len(),
715                        expected_text_count
716                    ));
717                }
718
719                let mut vectors = vec![Vec::new(); parsed.data.len()];
720                for (i, item) in parsed.data.into_iter().enumerate() {
721                    let index = item.index.unwrap_or(i as u32) as usize;
722                    if index >= vectors.len() {
723                        return Err(
724                            "openai compatible response contains invalid vector index".to_string()
725                        );
726                    }
727                    vectors[index] = item.embedding;
728                }
729
730                for vector in &vectors {
731                    if vector.is_empty() {
732                        return Err(
733                            "openai compatible response contained missing vectors".to_string()
734                        );
735                    }
736                }
737
738                self.dimension = vectors.first().map(Vec::len);
739                Ok(vectors)
740            }
741            SemanticEmbeddingEngine::Ollama {
742                client,
743                model,
744                base_url,
745            } => {
746                let expected_text_count = texts.len();
747                let endpoint = build_ollama_embeddings_endpoint(base_url);
748
749                #[derive(Serialize)]
750                struct OllamaPayload<'a> {
751                    model: &'a str,
752                    input: Vec<String>,
753                }
754
755                let payload = OllamaPayload {
756                    model,
757                    input: texts,
758                };
759
760                let raw = send_embedding_request(
761                    || {
762                        // `.json(&payload)` sets Content-Type automatically.
763                        // Same duplicate-header trap as the OpenAI branch above
764                        // — most Ollama servers tolerate it, but the
765                        // single-Content-Type form is the correct one.
766                        client.post(&endpoint).json(&payload)
767                    },
768                    "ollama",
769                )?;
770
771                #[derive(Deserialize)]
772                struct OllamaResponse {
773                    embeddings: Vec<Vec<f32>>,
774                }
775
776                let parsed: OllamaResponse = serde_json::from_str(&raw)
777                    .map_err(|error| format!("invalid ollama response: {error}"))?;
778                if parsed.embeddings.is_empty() {
779                    return Err("ollama response returned no embeddings".to_string());
780                }
781                if parsed.embeddings.len() != expected_text_count {
782                    return Err(format!(
783                        "ollama response returned {} embeddings for {} inputs",
784                        parsed.embeddings.len(),
785                        expected_text_count
786                    ));
787                }
788
789                let vectors = parsed.embeddings;
790                for vector in &vectors {
791                    if vector.is_empty() {
792                        return Err("ollama response contained empty embeddings".to_string());
793                    }
794                }
795
796                self.dimension = vectors.first().map(Vec::len);
797                Ok(vectors)
798            }
799        }
800    }
801}
802
/// Pre-validate ONNX Runtime by attempting a raw dynamic load before ort touches it.
///
/// This catches broken/incompatible libraries without risking a panic in the ort crate.
/// The library named by the `ORT_DYLIB_PATH` environment variable is probed when
/// that variable is set; otherwise the platform default name is used
/// (`libonnxruntime.so`, `libonnxruntime.dylib` or `onnxruntime.dll`).
///
/// After a successful load the version is checked on a best-effort basis:
/// on Linux/macOS it is derived from the library file name via
/// `detect_ort_version_from_path`, on Windows it is read from the DLL's
/// version resource. When no version can be determined the check is skipped.
/// On any other target nothing is probed and `Ok(())` is returned.
///
/// # Errors
///
/// Returns a user-facing message when the library path contains an interior
/// NUL (Linux/macOS), when the library cannot be loaded, or when a detected
/// version is not `1.x` with a minor version of at least 20.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    // An explicit override wins over the platform default library name.
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY: `c_name` is a NUL-terminated CString that outlives the dlopen
        // call, the dlerror pointer is null-checked before it is read, and
        // dlclose is only reached with a non-null handle.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Try to detect the runtime version from the file path or soname.
            // libonnxruntime.so.1.19.0, libonnxruntime.1.24.4.dylib, etc.
            let detected_version = detect_ort_version_from_path(lib_name);

            // The handle was only needed to prove the library loads.
            libc::dlclose(handle);

            // Check version compatibility — accept only 1.x with minor >= 20.
            // A version whose major/minor parts do not parse as integers is
            // treated like "no version detected" and is not rejected here.
            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, lib_name));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        // Validate ONNX Runtime availability on Windows by loading the DLL
        // via LoadLibraryExW before the ort crate attempts its own LoadLibrary.
        // This way we can produce a friendly error (with installation hints)
        // instead of a raw LoadLibrary failure from deep inside fastembed.
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        // Use kernel32 LoadLibraryExW for the validation — built-in, no
        // crate dependency required. GetModuleFileNameW resolves the loaded
        // DLL path for version probing via the version.dll API.
        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Local mirror of the fixed version record; only
        // `dw_file_version_ms` is read below.
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32, // HIWORD major, LOWORD minor
            dw_file_version_ls: u32, // HIWORD build, LOWORD revision
            dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY: `wide` is NUL-terminated and outlives the load call; every
        // buffer handed to the Win32 functions is sized by the length passed
        // alongside it; `vs_info` is null-checked before being dereferenced.
        // NOTE(review): the cast of `vs_info` assumes the root block ("\")
        // yields a VS_FIXEDFILEINFO — confirm against the VerQueryValueW docs.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // Probe the file version from PE resources so we can reject
            // outdated DLLs (e.g. v1.9.x) before the ort crate panics.
            // A major version of 0 doubles as the "not detected" sentinel.
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            // Use MAX_UNICODEPATH (32767) so deeply nested ORT paths (e.g.
            // long NuGet package paths under %USERPROFILE%) never truncate.
            // GetModuleFileNameW truncates silently when the buffer is too
            // small, which causes version probing to fail and the version
            // check to be bypassed — better to allocate generously.
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // "\" followed by a terminating NUL, as UTF-16.
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            // The handle was only needed for probing; release it before returning.
            FreeLibrary(handle);

            // Version compatibility check (mirrors the Linux/macOS path).
            // If version could not be detected (detected_major == 0) we let
            // the load succeed — the ort crate will diagnose further.
            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
990
991/// Try to extract the ORT version from the library filename or resolved symlink.
992/// Examples: "libonnxruntime.so.1.19.0" → "1.19.0", "libonnxruntime.1.24.4.dylib" → "1.24.4"
993#[cfg(any(target_os = "linux", target_os = "macos"))]
994fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
995    let path = std::path::Path::new(lib_path);
996
997    // Try the path as given, then follow symlinks
998    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
999        .into_iter()
1000        .flatten()
1001    {
1002        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1003            if let Some(version) = extract_version_from_filename(name) {
1004                return Some(version);
1005            }
1006        }
1007    }
1008
1009    // Also check for versioned siblings in the same directory
1010    if let Some(parent) = path.parent() {
1011        if let Ok(entries) = std::fs::read_dir(parent) {
1012            for entry in entries.flatten() {
1013                if let Some(name) = entry.file_name().to_str() {
1014                    if name.starts_with("libonnxruntime") {
1015                        if let Some(version) = extract_version_from_filename(name) {
1016                            return Some(version);
1017                        }
1018                    }
1019                }
1020            }
1021        }
1022    }
1023
1024    None
1025}
1026
/// Extract version from filenames like "libonnxruntime.so.1.19.0" or "libonnxruntime.1.24.4.dylib"
///
/// Returns the leftmost `X.Y.Z` run (three dot-separated groups of ASCII
/// digits) found anywhere in `name`, or `None` when there is no such run.
/// Only the first three components are returned, so "1.2.3.4" yields "1.2.3".
///
/// The scan is done by hand rather than with a regular expression so that no
/// pattern has to be compiled on every call — this function runs once per
/// directory entry during the sibling scan. Only ASCII digits are accepted,
/// which keeps every returned component parseable with `str::parse::<u32>`.
#[cfg(any(target_os = "linux", target_os = "macos"))]
fn extract_version_from_filename(name: &str) -> Option<String> {
    let bytes = name.as_bytes();

    // Index just past the run of ASCII digits that begins at `from`.
    let skip_digits = |from: usize| {
        from + bytes[from..]
            .iter()
            .take_while(|byte| byte.is_ascii_digit())
            .count()
    };

    // Try every start offset from the left; the first full match wins.
    (0..bytes.len()).find_map(|start| {
        let mut pos = start;
        for component in 0..3 {
            if component > 0 {
                // Components after the first must be introduced by a dot.
                if bytes.get(pos) != Some(&b'.') {
                    return None;
                }
                pos += 1;
            }
            let end = skip_digits(pos);
            if end == pos {
                // No digits where a component was required.
                return None;
            }
            pos = end;
        }
        // `start` and `pos` both sit next to ASCII bytes, so they are valid
        // UTF-8 boundaries and the slice cannot panic.
        Some(name[start..pos].to_string())
    })
}
1034
/// Produce the (indented) shell command suggested to the user for removing
/// an outdated ONNX Runtime library.
///
/// Paths under `/usr/local/lib` and the bare default library names get the
/// system-wide cleanup command on Linux and macOS; every other path — and
/// every path on other targets — gets a plain `rm` of that exact path.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = lib_path.starts_with("/usr/local/lib")
        || matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");

    if is_system_install {
        // `cfg!` is resolved at compile time, so at most one branch survives.
        if cfg!(target_os = "linux") {
            return "   sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        }
        if cfg!(target_os = "macos") {
            return "   sudo rm /usr/local/lib/libonnxruntime*".to_string();
        }
    }

    format!("   rm '{}'", lib_path)
}
1047
1048/// Build the user-facing error message for an incompatible ONNX Runtime
1049/// install. Extracted as a pure helper so we can unit-test the wording
1050/// stability — the auto-fix recommendation must always come first because
1051/// it's the only safe option, and the system-rm step must remain present
1052/// because some users prefer the system-wide cleanup path.
1053pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1054    format!(
1055        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1056         Solutions:\n\
1057         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1058         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1059         configures the bridge to load it instead of the system library — no \
1060         changes to '{}'.\n\
1061         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1062         {}\n\
1063         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1064         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1065        version,
1066        lib_name,
1067        lib_name,
1068        suggest_removal_command(lib_name),
1069    )
1070}
1071
1072pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
1073    if message.trim_start().starts_with("ONNX Runtime not found.") {
1074        return true;
1075    }
1076
1077    let message = message.to_ascii_lowercase();
1078    let mentions_onnx_runtime = ["onnx runtime", "onnxruntime", "libonnxruntime"]
1079        .iter()
1080        .any(|pattern| message.contains(pattern));
1081    let mentions_dynamic_load_failure = [
1082        "shared library",
1083        "dynamic library",
1084        "failed to load",
1085        "could not load",
1086        "unable to load",
1087        "dlopen",
1088        "loadlibrary",
1089        "no such file",
1090        "not found",
1091    ]
1092    .iter()
1093    .any(|pattern| message.contains(pattern));
1094
1095    mentions_onnx_runtime && mentions_dynamic_load_failure
1096}
1097
1098pub fn format_embedding_init_error(error: impl Display) -> String {
1099    let message = error.to_string();
1100
1101    if is_onnx_runtime_unavailable(&message) {
1102        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1103    }
1104
1105    format!("failed to initialize semantic embedding model: {message}")
1106}
1107
/// A chunk of code ready for embedding — derived from a Symbol with context enrichment
///
/// `embed_text` is what gets sent to the embedding backend; the remaining
/// fields are carried alongside the resulting vector as chunk metadata.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// Absolute file path
    pub file: PathBuf,
    /// Symbol name
    pub name: String,
    /// Symbol kind (function, class, struct, etc.)
    pub kind: SymbolKind,
    /// Line range (0-based internally, inclusive)
    pub start_line: u32,
    /// End of the line range; same convention as `start_line`
    /// (0-based internally, inclusive).
    pub end_line: u32,
    /// Whether the symbol is exported
    pub exported: bool,
    /// The enriched text that gets embedded (scope + signature + body snippet)
    pub embed_text: String,
    /// Short code snippet for display in results
    pub snippet: String,
}
1127
/// A stored embedding entry — chunk metadata + vector
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    /// The chunk this entry was embedded from.
    chunk: SemanticChunk,
    /// Embedding returned by the backend for `chunk.embed_text`.
    vector: Vec<f32>,
}
1134
/// The semantic index — stores embeddings for all symbols in a project
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// One entry per embedded chunk.
    entries: Vec<EmbeddingEntry>,
    /// Track which files are indexed and their mtime for staleness detection
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Track indexed file sizes alongside mtimes for staleness detection
    file_sizes: HashMap<PathBuf, u64>,
    /// Content hash recorded for each indexed file; checked together with
    /// mtime and size when a file's freshness is verified.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Embedding dimension (384 for MiniLM-L6-v2)
    dimension: usize,
    /// NOTE(review): presumably identifies the backend/model configuration
    /// the vectors were produced with — confirm against
    /// `SemanticIndexFingerprint`. Freshly built indexes start with `None`.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Root of the indexed project; constructors debug-assert it is absolute.
    project_root: PathBuf,
    /// NOTE(review): appears to hold files whose processing is still
    /// pending; refresh drops paths that left the walked set or were
    /// collected successfully — confirm the exact contract with callers.
    deferred_files: HashSet<PathBuf>,
}
1150
/// Per-file metadata captured while collecting chunks. When an index is
/// built, the three fields are stored in the index's `file_mtimes`,
/// `file_sizes` and `file_hashes` tables respectively.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time recorded for the file.
    mtime: SystemTime,
    /// File size recorded for the file.
    // NOTE(review): presumably in bytes — confirm in `collect_file_metadata`.
    size: u64,
    /// blake3 hash recorded for the file's content.
    content_hash: blake3::Hash,
}
1157
/// Outcome of an incremental refresh of the semantic index. All counters are
/// numbers of files; `total_processed` counts the current and deleted files
/// that were considered.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    pub changed: usize,
    pub added: usize,
    pub deleted: usize,
    pub total_processed: usize,
}

impl RefreshSummary {
    /// Reports whether the refresh left every file untouched, i.e. nothing
    /// was changed, added or deleted. `total_processed` is not consulted.
    pub fn is_noop(&self) -> bool {
        matches!((self.changed, self.added, self.deleted), (0, 0, 0))
    }
}
1174
/// Bundle of results from refreshing a set of invalidated files.
// NOTE(review): the producer and consumer of this type are outside this
// section; the field notes below are inferred from names and types only.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Newly embedded entries — presumably to be merged into the index.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Per-file freshness records — presumably the updated mtime/size/hash
    /// for files that were re-checked; confirm with the caller.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Paths whose refresh finished — TODO confirm whether failures are included.
    pub completed_paths: Vec<PathBuf>,
    /// File counts for this refresh.
    pub summary: RefreshSummary,
}
1182
/// Search result from a semantic query
///
/// Apart from `score` and `source`, the fields have the same names and
/// types as the corresponding `SemanticChunk` fields.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File containing the matched symbol.
    pub file: PathBuf,
    /// Symbol name.
    pub name: String,
    /// Symbol kind.
    pub kind: SymbolKind,
    /// First line of the match.
    // NOTE(review): presumably the same 0-based inclusive convention as
    // `SemanticChunk` — confirm where results are built.
    pub start_line: u32,
    /// Last line of the match.
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Short code snippet for display.
    pub snippet: String,
    /// Relevance score of this result.
    // NOTE(review): scale and ordering (higher = better?) are not visible here.
    pub score: f32,
    /// Static label naming where the result came from — TODO confirm the set
    /// of values used by callers.
    pub source: &'static str,
}
1196
1197impl SemanticIndex {
1198    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1199        debug_assert!(project_root.is_absolute());
1200        Self {
1201            entries: Vec::new(),
1202            file_mtimes: HashMap::new(),
1203            file_sizes: HashMap::new(),
1204            file_hashes: HashMap::new(),
1205            dimension,
1206            fingerprint: None,
1207            project_root,
1208            deferred_files: HashSet::new(),
1209        }
1210    }
1211
    /// Number of embedded symbol entries.
    ///
    /// Counts stored entries (one per embedded chunk), not files.
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
1216
    /// Number of files currently tracked by the semantic index.
    ///
    /// Derived from the mtime table, so a tracked file that produced no
    /// entries is still counted.
    pub fn indexed_file_count(&self) -> usize {
        self.file_mtimes.len()
    }
1221
1222    /// Human-readable status label for the index.
1223    pub fn status_label(&self) -> &'static str {
1224        if self.entries.is_empty() {
1225            "empty"
1226        } else {
1227            "ready"
1228        }
1229    }
1230
1231    fn collect_chunks(
1232        project_root: &Path,
1233        files: &[PathBuf],
1234    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1235        let collect_started = std::time::Instant::now();
1236        let per_file: Vec<(
1237            PathBuf,
1238            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1239        )> = files
1240            .par_iter()
1241            .map_init(HashMap::new, |parsers, file| {
1242                let result = collect_file_metadata(file).and_then(|metadata| {
1243                    collect_file_chunks(project_root, file, parsers)
1244                        .map(|chunks| (metadata, chunks))
1245                });
1246                (file.clone(), result)
1247            })
1248            .collect();
1249
1250        let mut chunks: Vec<SemanticChunk> = Vec::new();
1251        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1252
1253        for (file, result) in per_file {
1254            match result {
1255                Ok((metadata, file_chunks)) => {
1256                    file_metadata.insert(file, metadata);
1257                    chunks.extend(file_chunks);
1258                }
1259                Err(error) => {
1260                    // "unsupported file extension" is expected for non-code files
1261                    // (json, xml, .gitignore, etc.) that get included in the
1262                    // project walk. Pre-fix this was swallowed by .unwrap_or_default();
1263                    // we now skip silently to keep the log clean. Only real read/parse
1264                    // errors are worth surfacing.
1265                    if error == "unsupported file extension" {
1266                        continue;
1267                    }
1268                    slog_warn!(
1269                        "failed to collect semantic chunks for {}: {}",
1270                        file.display(),
1271                        error
1272                    );
1273                }
1274            }
1275        }
1276
1277        slog_info!(
1278            "semantic collect: {} chunks from {} files in {} ms",
1279            chunks.len(),
1280            file_metadata.len(),
1281            collect_started.elapsed().as_millis()
1282        );
1283
1284        (chunks, file_metadata)
1285    }
1286
1287    fn build_from_chunks<F, P>(
1288        project_root: &Path,
1289        chunks: Vec<SemanticChunk>,
1290        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1291        embed_fn: &mut F,
1292        max_batch_size: usize,
1293        mut progress: Option<&mut P>,
1294    ) -> Result<Self, String>
1295    where
1296        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1297        P: FnMut(usize, usize),
1298    {
1299        debug_assert!(project_root.is_absolute());
1300        let total_chunks = chunks.len();
1301
1302        if chunks.is_empty() {
1303            return Ok(Self {
1304                entries: Vec::new(),
1305                file_mtimes: file_metadata
1306                    .iter()
1307                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
1308                    .collect(),
1309                file_sizes: file_metadata
1310                    .iter()
1311                    .map(|(path, metadata)| (path.clone(), metadata.size))
1312                    .collect(),
1313                file_hashes: file_metadata
1314                    .into_iter()
1315                    .map(|(path, metadata)| (path, metadata.content_hash))
1316                    .collect(),
1317                dimension: DEFAULT_DIMENSION,
1318                fingerprint: None,
1319                project_root: project_root.to_path_buf(),
1320                deferred_files: HashSet::new(),
1321            });
1322        }
1323
1324        // Embed in batches
1325        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1326        let mut expected_dimension: Option<usize> = None;
1327        let batch_size = max_batch_size.max(1);
1328        let embed_started = std::time::Instant::now();
1329        let batch_count = total_chunks.div_ceil(batch_size);
1330        for batch_start in (0..chunks.len()).step_by(batch_size) {
1331            let batch_end = (batch_start + batch_size).min(chunks.len());
1332            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1333                .iter()
1334                .map(|c| c.embed_text.clone())
1335                .collect();
1336
1337            let vectors = embed_fn(batch_texts)?;
1338            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1339
1340            // Track consistent dimension across all batches
1341            if let Some(dim) = vectors.first().map(|v| v.len()) {
1342                match expected_dimension {
1343                    None => expected_dimension = Some(dim),
1344                    Some(expected) if dim != expected => {
1345                        return Err(format!(
1346                            "embedding dimension changed across batches: expected {expected}, got {dim}"
1347                        ));
1348                    }
1349                    _ => {}
1350                }
1351            }
1352
1353            for (i, vector) in vectors.into_iter().enumerate() {
1354                let chunk_idx = batch_start + i;
1355                entries.push(EmbeddingEntry {
1356                    chunk: chunks[chunk_idx].clone(),
1357                    vector,
1358                });
1359            }
1360
1361            if let Some(callback) = progress.as_mut() {
1362                callback(entries.len(), total_chunks);
1363            }
1364        }
1365
1366        let embed_ms = embed_started.elapsed().as_millis();
1367        let rate = (total_chunks as u128 * 1000)
1368            .checked_div(embed_ms)
1369            .unwrap_or(0) as u64;
1370        slog_info!(
1371            "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1372            total_chunks,
1373            batch_count,
1374            embed_ms,
1375            rate
1376        );
1377
1378        let dimension = entries
1379            .first()
1380            .map(|e| e.vector.len())
1381            .unwrap_or(DEFAULT_DIMENSION);
1382
1383        Ok(Self {
1384            entries,
1385            file_mtimes: file_metadata
1386                .iter()
1387                .map(|(path, metadata)| (path.clone(), metadata.mtime))
1388                .collect(),
1389            file_sizes: file_metadata
1390                .iter()
1391                .map(|(path, metadata)| (path.clone(), metadata.size))
1392                .collect(),
1393            file_hashes: file_metadata
1394                .into_iter()
1395                .map(|(path, metadata)| (path, metadata.content_hash))
1396                .collect(),
1397            dimension,
1398            fingerprint: None,
1399            project_root: project_root.to_path_buf(),
1400            deferred_files: HashSet::new(),
1401        })
1402    }
1403
1404    /// Build the semantic index from a set of files using the provided embedding function.
1405    /// `embed_fn` takes a batch of texts and returns a batch of embedding vectors.
1406    pub fn build<F>(
1407        project_root: &Path,
1408        files: &[PathBuf],
1409        embed_fn: &mut F,
1410        max_batch_size: usize,
1411    ) -> Result<Self, String>
1412    where
1413        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1414    {
1415        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1416        Self::build_from_chunks(
1417            project_root,
1418            chunks,
1419            file_mtimes,
1420            embed_fn,
1421            max_batch_size,
1422            Option::<&mut fn(usize, usize)>::None,
1423        )
1424    }
1425
1426    /// Build the semantic index and report embedding progress using entry counts.
1427    pub fn build_with_progress<F, P>(
1428        project_root: &Path,
1429        files: &[PathBuf],
1430        embed_fn: &mut F,
1431        max_batch_size: usize,
1432        progress: &mut P,
1433    ) -> Result<Self, String>
1434    where
1435        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1436        P: FnMut(usize, usize),
1437    {
1438        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1439        let total_chunks = chunks.len();
1440        progress(0, total_chunks);
1441        Self::build_from_chunks(
1442            project_root,
1443            chunks,
1444            file_mtimes,
1445            embed_fn,
1446            max_batch_size,
1447            Some(progress),
1448        )
1449    }
1450
1451    /// Incrementally refresh entries for changed/new files only, preserving cached
1452    /// embeddings for unchanged files. Used when loading the index from disk and
1453    /// finding that a small fraction of files have moved on, deleted, or appeared.
1454    ///
1455    /// Returns `RefreshSummary` describing what changed. On success, `self` is
1456    /// mutated in place and remains a valid index.
1457    ///
1458    /// `current_files` is the full set of files the project considers indexable
1459    /// (typically `walk_project_files(...)`). Files in the cache that are no
1460    /// longer in this set are treated as deleted.
1461    pub fn refresh_stale_files<F, P>(
1462        &mut self,
1463        project_root: &Path,
1464        current_files: &[PathBuf],
1465        embed_fn: &mut F,
1466        max_batch_size: usize,
1467        progress: &mut P,
1468    ) -> Result<RefreshSummary, String>
1469    where
1470        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1471        P: FnMut(usize, usize),
1472    {
1473        self.backfill_missing_file_sizes();
1474
1475        // 1. Bucket files into deleted / changed / added.
1476        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
1477        self.deferred_files
1478            .retain(|path| current_set.contains(path.as_path()));
1479        let total_processed = current_set.len() + self.file_mtimes.len()
1480            - self
1481                .file_mtimes
1482                .keys()
1483                .filter(|path| current_set.contains(path.as_path()))
1484                .count();
1485
1486        // Files in cache that disappeared from disk OR are no longer in the
1487        // walked set. Both cases need their entries dropped.
1488        let mut deleted: Vec<PathBuf> = Vec::new();
1489        let mut changed: Vec<PathBuf> = Vec::new();
1490        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
1491        for indexed_path in &indexed_paths {
1492            if !current_set.contains(indexed_path.as_path()) {
1493                deleted.push(indexed_path.clone());
1494                continue;
1495            }
1496            let cached = match (
1497                self.file_mtimes.get(indexed_path),
1498                self.file_sizes.get(indexed_path),
1499                self.file_hashes.get(indexed_path),
1500            ) {
1501                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
1502                    mtime: *mtime,
1503                    size: *size,
1504                    content_hash: *hash,
1505                }),
1506                _ => None,
1507            };
1508            match cached
1509                .map(|freshness| cache_freshness::verify_file_strict(indexed_path, &freshness))
1510            {
1511                Some(FreshnessVerdict::HotFresh) => {}
1512                Some(FreshnessVerdict::ContentFresh {
1513                    new_mtime,
1514                    new_size,
1515                }) => {
1516                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
1517                    self.file_sizes.insert(indexed_path.clone(), new_size);
1518                }
1519                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
1520                    changed.push(indexed_path.clone());
1521                }
1522            }
1523        }
1524
1525        // Files in walk that were never indexed.
1526        let mut added: Vec<PathBuf> = Vec::new();
1527        for path in current_files {
1528            if !self.file_mtimes.contains_key(path) {
1529                added.push(path.clone());
1530            }
1531        }
1532
1533        // Fast path: nothing to do.
1534        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
1535            progress(0, 0);
1536            return Ok(RefreshSummary {
1537                total_processed,
1538                ..RefreshSummary::default()
1539            });
1540        }
1541
1542        // 2. Drop entries for deleted files immediately. Changed files are only
1543        //    replaced after successful re-extraction + embedding so transient
1544        //    read/parse errors keep the stale-but-valid cache entry.
1545        if !deleted.is_empty() {
1546            self.remove_indexed_files(&deleted);
1547        }
1548
1549        // 3. Embed the changed + added set, if any.
1550        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
1551        to_embed.extend(changed.iter().cloned());
1552        to_embed.extend(added.iter().cloned());
1553
1554        if to_embed.is_empty() {
1555            // Only deletions happened.
1556            progress(0, 0);
1557            return Ok(RefreshSummary {
1558                changed: 0,
1559                added: 0,
1560                deleted: deleted.len(),
1561                total_processed,
1562            });
1563        }
1564
1565        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
1566        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
1567        let vanished = to_embed
1568            .iter()
1569            .filter(|path| {
1570                changed_set.contains(path.as_path())
1571                    && !fresh_metadata.contains_key(*path)
1572                    && !path.exists()
1573            })
1574            .cloned()
1575            .collect::<Vec<_>>();
1576        if !vanished.is_empty() {
1577            self.remove_indexed_files(&vanished);
1578            deleted.extend(vanished);
1579        }
1580
1581        if chunks.is_empty() {
1582            progress(0, 0);
1583            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1584            for file in &successful_files {
1585                self.deferred_files.remove(file);
1586            }
1587            if !successful_files.is_empty() {
1588                self.entries
1589                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
1590            }
1591            let changed_count = changed
1592                .iter()
1593                .filter(|path| successful_files.contains(*path))
1594                .count();
1595            let added_count = added
1596                .iter()
1597                .filter(|path| successful_files.contains(*path))
1598                .count();
1599            for (file, metadata) in fresh_metadata {
1600                self.file_mtimes.insert(file.clone(), metadata.mtime);
1601                self.file_sizes.insert(file.clone(), metadata.size);
1602                self.file_hashes.insert(file.clone(), metadata.content_hash);
1603            }
1604            return Ok(RefreshSummary {
1605                changed: changed_count,
1606                added: added_count,
1607                deleted: deleted.len(),
1608                total_processed,
1609            });
1610        }
1611
1612        // 4. Embed in batches and dimension-check against the existing index.
1613        let total_chunks = chunks.len();
1614        progress(0, total_chunks);
1615        let batch_size = max_batch_size.max(1);
1616        let existing_dimension = if self.entries.is_empty() {
1617            None
1618        } else {
1619            Some(self.dimension)
1620        };
1621        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1622        let mut observed_dimension: Option<usize> = existing_dimension;
1623
1624        for batch_start in (0..chunks.len()).step_by(batch_size) {
1625            let batch_end = (batch_start + batch_size).min(chunks.len());
1626            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1627                .iter()
1628                .map(|c| c.embed_text.clone())
1629                .collect();
1630
1631            let vectors = embed_fn(batch_texts)?;
1632            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1633
1634            if let Some(dim) = vectors.first().map(|v| v.len()) {
1635                match observed_dimension {
1636                    None => observed_dimension = Some(dim),
1637                    Some(expected) if dim != expected => {
1638                        // Refuse to mix dimensions in one index. Caller should
1639                        // fall back to a full rebuild.
1640                        return Err(format!(
1641                            "embedding dimension changed during incremental refresh: \
1642                             cached index uses {expected}, new vectors use {dim}"
1643                        ));
1644                    }
1645                    _ => {}
1646                }
1647            }
1648
1649            for (i, vector) in vectors.into_iter().enumerate() {
1650                let chunk_idx = batch_start + i;
1651                new_entries.push(EmbeddingEntry {
1652                    chunk: chunks[chunk_idx].clone(),
1653                    vector,
1654                });
1655            }
1656
1657            progress(new_entries.len(), total_chunks);
1658        }
1659
1660        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1661        for file in &successful_files {
1662            self.deferred_files.remove(file);
1663        }
1664        if !successful_files.is_empty() {
1665            self.entries
1666                .retain(|entry| !successful_files.contains(&entry.chunk.file));
1667        }
1668
1669        self.entries.extend(new_entries);
1670        for (file, metadata) in fresh_metadata {
1671            self.file_mtimes.insert(file.clone(), metadata.mtime);
1672            self.file_sizes.insert(file.clone(), metadata.size);
1673            self.file_hashes.insert(file, metadata.content_hash);
1674        }
1675        if let Some(dim) = observed_dimension {
1676            self.dimension = dim;
1677        }
1678
1679        Ok(RefreshSummary {
1680            changed: changed
1681                .iter()
1682                .filter(|path| successful_files.contains(*path))
1683                .count(),
1684            added: added
1685                .iter()
1686                .filter(|path| successful_files.contains(*path))
1687                .count(),
1688            deleted: deleted.len(),
1689            total_processed,
1690        })
1691    }
1692
1693    /// Refresh exactly the files invalidated by the live watcher, without
1694    /// treating the provided path list as the whole project. This is the
1695    /// watcher-side counterpart to `refresh_stale_files`: it drops any stale
1696    /// entries for the requested paths from this in-memory index, re-extracts
1697    /// whatever still exists on disk, embeds those chunks, and returns the
1698    /// delta needed for another in-memory index to apply the same update.
1699    pub fn refresh_invalidated_files<F, P>(
1700        &mut self,
1701        project_root: &Path,
1702        paths: &[PathBuf],
1703        embed_fn: &mut F,
1704        max_batch_size: usize,
1705        max_files: usize,
1706        progress: &mut P,
1707    ) -> Result<InvalidatedFilesRefresh, String>
1708    where
1709        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1710        P: FnMut(usize, usize),
1711    {
1712        self.backfill_missing_file_sizes();
1713
1714        self.deferred_files.retain(|path| path.exists());
1715        let mut requested_paths = paths.to_vec();
1716        requested_paths.extend(self.deferred_files.iter().cloned());
1717        requested_paths.sort();
1718        requested_paths.dedup();
1719        let total_processed = requested_paths.len();
1720
1721        if requested_paths.is_empty() {
1722            progress(0, 0);
1723            return Ok(InvalidatedFilesRefresh {
1724                summary: RefreshSummary {
1725                    total_processed,
1726                    ..RefreshSummary::default()
1727                },
1728                ..InvalidatedFilesRefresh::default()
1729            });
1730        }
1731
1732        let previously_indexed: HashSet<PathBuf> = requested_paths
1733            .iter()
1734            .filter(|path| self.file_mtimes.contains_key(*path))
1735            .cloned()
1736            .collect();
1737
1738        // The watcher path has already invalidated these files in the request
1739        // thread's live index. Mirror that behavior here before inserting any
1740        // fresh chunks so parse/read failures do not resurrect stale entries.
1741        self.remove_indexed_files(&requested_paths);
1742
1743        let existing_paths = requested_paths
1744            .iter()
1745            .filter(|path| path.exists())
1746            .cloned()
1747            .collect::<Vec<_>>();
1748        let deleted = requested_paths
1749            .iter()
1750            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
1751            .count();
1752
1753        if existing_paths.is_empty() {
1754            for path in &requested_paths {
1755                if !path.exists() {
1756                    self.deferred_files.remove(path);
1757                }
1758            }
1759            progress(0, 0);
1760            return Ok(InvalidatedFilesRefresh {
1761                completed_paths: requested_paths,
1762                summary: RefreshSummary {
1763                    deleted,
1764                    total_processed,
1765                    ..RefreshSummary::default()
1766                },
1767                ..InvalidatedFilesRefresh::default()
1768            });
1769        }
1770
1771        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);
1772
1773        let retained_file_count = self.file_mtimes.len();
1774        let changed_successful_count = existing_paths
1775            .iter()
1776            .filter(|path| {
1777                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1778            })
1779            .count();
1780        let available_new_files =
1781            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
1782        let new_successful_files = existing_paths
1783            .iter()
1784            .filter(|path| {
1785                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1786            })
1787            .cloned()
1788            .collect::<Vec<_>>();
1789        if new_successful_files.len() > available_new_files {
1790            let allowed_new_files = new_successful_files
1791                .iter()
1792                .take(available_new_files)
1793                .cloned()
1794                .collect::<HashSet<_>>();
1795            let deferred_new_files = new_successful_files
1796                .into_iter()
1797                .filter(|path| !allowed_new_files.contains(path))
1798                .collect::<HashSet<_>>();
1799
1800            fresh_metadata.retain(|file, _| {
1801                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
1802            });
1803            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));
1804
1805            if !deferred_new_files.is_empty() {
1806                for path in &deferred_new_files {
1807                    self.deferred_files.insert(path.clone());
1808                }
1809                slog_warn!(
1810                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
1811                    deferred_new_files.len(),
1812                    max_files
1813                );
1814            }
1815        }
1816
1817        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1818        for file in &successful_files {
1819            self.deferred_files.remove(file);
1820        }
1821        let changed = successful_files
1822            .iter()
1823            .filter(|path| previously_indexed.contains(path.as_path()))
1824            .count();
1825        let added = successful_files.len().saturating_sub(changed);
1826        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());
1827
1828        if chunks.is_empty() {
1829            progress(0, 0);
1830            for (file, metadata) in fresh_metadata {
1831                let freshness = FileFreshness {
1832                    mtime: metadata.mtime,
1833                    size: metadata.size,
1834                    content_hash: metadata.content_hash,
1835                };
1836                self.file_mtimes.insert(file.clone(), freshness.mtime);
1837                self.file_sizes.insert(file.clone(), freshness.size);
1838                self.file_hashes
1839                    .insert(file.clone(), freshness.content_hash);
1840                updated_metadata.push((file, freshness));
1841            }
1842
1843            return Ok(InvalidatedFilesRefresh {
1844                updated_metadata,
1845                completed_paths: requested_paths,
1846                summary: RefreshSummary {
1847                    changed,
1848                    added,
1849                    deleted,
1850                    total_processed,
1851                },
1852                ..InvalidatedFilesRefresh::default()
1853            });
1854        }
1855
1856        let total_chunks = chunks.len();
1857        progress(0, total_chunks);
1858        let batch_size = max_batch_size.max(1);
1859        let mut observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty() {
1860            None
1861        } else {
1862            Some(self.dimension)
1863        };
1864        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1865
1866        for batch_start in (0..chunks.len()).step_by(batch_size) {
1867            let batch_end = (batch_start + batch_size).min(chunks.len());
1868            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1869                .iter()
1870                .map(|chunk| chunk.embed_text.clone())
1871                .collect();
1872
1873            let vectors = embed_fn(batch_texts)?;
1874            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1875
1876            if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1877                match observed_dimension {
1878                    None => observed_dimension = Some(dim),
1879                    Some(expected) if dim != expected => {
1880                        return Err(format!(
1881                            "embedding dimension changed during invalidated-file refresh: \
1882                             cached index uses {expected}, new vectors use {dim}"
1883                        ));
1884                    }
1885                    _ => {}
1886                }
1887            }
1888
1889            for (i, vector) in vectors.into_iter().enumerate() {
1890                let chunk_idx = batch_start + i;
1891                new_entries.push(EmbeddingEntry {
1892                    chunk: chunks[chunk_idx].clone(),
1893                    vector,
1894                });
1895            }
1896
1897            progress(new_entries.len(), total_chunks);
1898        }
1899
1900        let added_entries = new_entries.clone();
1901        self.entries.extend(new_entries);
1902        for (file, metadata) in fresh_metadata {
1903            let freshness = FileFreshness {
1904                mtime: metadata.mtime,
1905                size: metadata.size,
1906                content_hash: metadata.content_hash,
1907            };
1908            self.file_mtimes.insert(file.clone(), freshness.mtime);
1909            self.file_sizes.insert(file.clone(), freshness.size);
1910            self.file_hashes
1911                .insert(file.clone(), freshness.content_hash);
1912            updated_metadata.push((file, freshness));
1913        }
1914        if let Some(dim) = observed_dimension {
1915            self.dimension = dim;
1916        }
1917
1918        Ok(InvalidatedFilesRefresh {
1919            added_entries,
1920            updated_metadata,
1921            completed_paths: requested_paths,
1922            summary: RefreshSummary {
1923                changed,
1924                added,
1925                deleted,
1926                total_processed,
1927            },
1928        })
1929    }
1930
1931    pub fn apply_refresh_update(
1932        &mut self,
1933        added_entries: Vec<EmbeddingEntry>,
1934        updated_metadata: Vec<(PathBuf, FileFreshness)>,
1935        completed_paths: &[PathBuf],
1936    ) {
1937        self.remove_indexed_files(completed_paths);
1938
1939        let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
1940        self.entries.extend(added_entries);
1941        for (file, freshness) in updated_metadata {
1942            self.file_mtimes.insert(file.clone(), freshness.mtime);
1943            self.file_sizes.insert(file.clone(), freshness.size);
1944            self.file_hashes.insert(file, freshness.content_hash);
1945        }
1946        if let Some(dim) = observed_dimension {
1947            self.dimension = dim;
1948        }
1949    }
1950
1951    fn remove_indexed_files(&mut self, files: &[PathBuf]) {
1952        let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1953        self.entries
1954            .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
1955        for path in files {
1956            self.file_mtimes.remove(path);
1957            self.file_sizes.remove(path);
1958            self.file_hashes.remove(path);
1959        }
1960    }
1961
1962    /// Search the index with a query embedding, returning top-K results sorted by relevance
1963    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1964        if self.entries.is_empty() || query_vector.len() != self.dimension {
1965            return Vec::new();
1966        }
1967
1968        let mut scored: Vec<(f32, usize)> = self
1969            .entries
1970            .iter()
1971            .enumerate()
1972            .map(|(i, entry)| {
1973                let mut score = cosine_similarity(query_vector, &entry.vector);
1974                if entry.chunk.exported {
1975                    score *= 1.1;
1976                }
1977                (score, i)
1978            })
1979            .collect();
1980
1981        // Sort descending by score
1982        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1983
1984        scored
1985            .into_iter()
1986            .take(top_k)
1987            // Keep the sort → take → map ordering explicit: removing the old
1988            // `> 0.0` floor cannot evict positive hits because top_k has already
1989            // been selected, but it can surface zero-score noise in the tail.
1990            .map(|(score, idx)| {
1991                let entry = &self.entries[idx];
1992                SemanticResult {
1993                    file: entry.chunk.file.clone(),
1994                    name: entry.chunk.name.clone(),
1995                    kind: entry.chunk.kind.clone(),
1996                    start_line: entry.chunk.start_line,
1997                    end_line: entry.chunk.end_line,
1998                    exported: entry.chunk.exported,
1999                    snippet: entry.chunk.snippet.clone(),
2000                    score,
2001                    source: "semantic",
2002                }
2003            })
2004            .collect()
2005    }
2006
2007    /// Number of indexed entries
2008    pub fn len(&self) -> usize {
2009        self.entries.len()
2010    }
2011
2012    /// Check if a file needs re-indexing based on mtime/size
2013    pub fn is_file_stale(&self, file: &Path) -> bool {
2014        let Some(stored_mtime) = self.file_mtimes.get(file) else {
2015            return true;
2016        };
2017        let Some(stored_size) = self.file_sizes.get(file) else {
2018            return true;
2019        };
2020        let Some(stored_hash) = self.file_hashes.get(file) else {
2021            return true;
2022        };
2023        let cached = FileFreshness {
2024            mtime: *stored_mtime,
2025            size: *stored_size,
2026            content_hash: *stored_hash,
2027        };
2028        match cache_freshness::verify_file_strict(file, &cached) {
2029            FreshnessVerdict::HotFresh => false,
2030            FreshnessVerdict::ContentFresh { .. } => false,
2031            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2032        }
2033    }
2034
2035    fn backfill_missing_file_sizes(&mut self) {
2036        for path in self.file_mtimes.keys() {
2037            if self.file_sizes.contains_key(path) {
2038                continue;
2039            }
2040            if let Ok(metadata) = fs::metadata(path) {
2041                self.file_sizes.insert(path.clone(), metadata.len());
2042                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2043                    self.file_hashes.insert(path.clone(), hash);
2044                }
2045            }
2046        }
2047    }
2048
2049    /// Remove entries for a specific file
2050    pub fn remove_file(&mut self, file: &Path) {
2051        self.invalidate_file(file);
2052    }
2053
2054    pub fn invalidate_file(&mut self, file: &Path) {
2055        let canonical_file = canonicalize_existing_or_deleted_path(file);
2056        self.entries
2057            .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2058        self.file_mtimes.remove(file);
2059        self.file_sizes.remove(file);
2060        self.file_hashes.remove(file);
2061        if canonical_file.as_path() != file {
2062            self.file_mtimes.remove(&canonical_file);
2063            self.file_sizes.remove(&canonical_file);
2064            self.file_hashes.remove(&canonical_file);
2065        }
2066    }
2067
2068    /// Get the embedding dimension
2069    pub fn dimension(&self) -> usize {
2070        self.dimension
2071    }
2072
2073    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
2074        self.fingerprint.as_ref()
2075    }
2076
2077    pub fn backend_label(&self) -> Option<&str> {
2078        self.fingerprint.as_ref().map(|f| f.backend.as_str())
2079    }
2080
2081    pub fn model_label(&self) -> Option<&str> {
2082        self.fingerprint.as_ref().map(|f| f.model.as_str())
2083    }
2084
2085    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
2086        self.fingerprint = Some(fingerprint);
2087    }
2088
2089    /// Write the semantic index to disk using atomic temp+rename pattern
2090    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2091        // Don't persist empty indexes — they would be loaded on next startup
2092        // and prevent a fresh build that might find files.
2093        if self.entries.is_empty() {
2094            slog_info!("skipping semantic index persistence (0 entries)");
2095            return;
2096        }
2097        let dir = storage_dir.join("semantic").join(project_key);
2098        if let Err(e) = fs::create_dir_all(&dir) {
2099            slog_warn!("failed to create semantic cache dir: {}", e);
2100            return;
2101        }
2102        let data_path = dir.join("semantic.bin");
2103        let tmp_path = dir.join(format!(
2104            "semantic.bin.tmp.{}.{}",
2105            std::process::id(),
2106            SystemTime::now()
2107                .duration_since(SystemTime::UNIX_EPOCH)
2108                .unwrap_or(Duration::ZERO)
2109                .as_nanos()
2110        ));
2111        let bytes = self.to_bytes();
2112        let write_result = (|| -> std::io::Result<()> {
2113            use std::io::Write;
2114            let mut file = fs::File::create(&tmp_path)?;
2115            file.write_all(&bytes)?;
2116            file.sync_all()?;
2117            Ok(())
2118        })();
2119        if let Err(e) = write_result {
2120            slog_warn!("failed to write semantic index: {}", e);
2121            let _ = fs::remove_file(&tmp_path);
2122            return;
2123        }
2124        if let Err(e) = fs::rename(&tmp_path, &data_path) {
2125            slog_warn!("failed to rename semantic index: {}", e);
2126            let _ = fs::remove_file(&tmp_path);
2127            return;
2128        }
2129        slog_info!(
2130            "semantic index persisted: {} entries, {:.1} KB",
2131            self.entries.len(),
2132            bytes.len() as f64 / 1024.0
2133        );
2134    }
2135
2136    /// Read the semantic index from disk
2137    pub fn read_from_disk(
2138        storage_dir: &Path,
2139        project_key: &str,
2140        current_canonical_root: &Path,
2141        is_worktree_bridge: bool,
2142        expected_fingerprint: Option<&str>,
2143    ) -> Option<Self> {
2144        debug_assert!(current_canonical_root.is_absolute());
2145        let data_path = storage_dir
2146            .join("semantic")
2147            .join(project_key)
2148            .join("semantic.bin");
2149        let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
2150        if file_len < HEADER_BYTES_V1 {
2151            slog_warn!(
2152                "corrupt semantic index (too small: {} bytes), removing",
2153                file_len
2154            );
2155            if !is_worktree_bridge {
2156                let _ = fs::remove_file(&data_path);
2157            }
2158            return None;
2159        }
2160
2161        let bytes = fs::read(&data_path).ok()?;
2162        let version = bytes[0];
2163        if version != SEMANTIC_INDEX_VERSION_V6 {
2164            slog_info!(
2165                "cached semantic index version {} is older than {}, rebuilding",
2166                version,
2167                SEMANTIC_INDEX_VERSION_V6
2168            );
2169            if !is_worktree_bridge {
2170                let _ = fs::remove_file(&data_path);
2171            }
2172            return None;
2173        }
2174        match Self::from_bytes(&bytes, current_canonical_root) {
2175            Ok(index) => {
2176                if index.entries.is_empty() {
2177                    slog_info!("cached semantic index is empty, will rebuild");
2178                    if !is_worktree_bridge {
2179                        let _ = fs::remove_file(&data_path);
2180                    }
2181                    return None;
2182                }
2183                if let Some(expected) = expected_fingerprint {
2184                    let matches = index
2185                        .fingerprint()
2186                        .map(|fingerprint| fingerprint.matches_expected(expected))
2187                        .unwrap_or(false);
2188                    if !matches {
2189                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2190                        if !is_worktree_bridge {
2191                            let _ = fs::remove_file(&data_path);
2192                        }
2193                        return None;
2194                    }
2195                }
2196                slog_info!(
2197                    "loaded semantic index from disk: {} entries",
2198                    index.entries.len()
2199                );
2200                Some(index)
2201            }
2202            Err(e) => {
2203                slog_warn!("corrupt semantic index, rebuilding: {}", e);
2204                if !is_worktree_bridge {
2205                    let _ = fs::remove_file(&data_path);
2206                }
2207                None
2208            }
2209        }
2210    }
2211
2212    /// Serialize the index to bytes for disk persistence
2213    pub fn to_bytes(&self) -> Vec<u8> {
2214        let mut buf = Vec::new();
2215        let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
2216            let encoded = fingerprint.as_string();
2217            if encoded.is_empty() {
2218                None
2219            } else {
2220                Some(encoded.into_bytes())
2221            }
2222        });
2223        let file_mtimes: Vec<_> = self
2224            .file_mtimes
2225            .iter()
2226            .filter_map(|(path, mtime)| {
2227                cache_relative_path(&self.project_root, path)
2228                    .map(|relative| (relative, path, mtime))
2229            })
2230            .collect();
2231        let entries: Vec<_> = self
2232            .entries
2233            .iter()
2234            .filter_map(|entry| {
2235                cache_relative_path(&self.project_root, &entry.chunk.file)
2236                    .map(|relative| (relative, entry))
2237            })
2238            .collect();
2239
2240        // Header: version(1) + dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint
2241        //
2242        // V6 is the single write format. Layout extends V5:
2243        //   - fingerprint is always represented (absent ⇒ fingerprint_len=0,
2244        //     no bytes follow). Uniform format simplifies the reader.
2245        //   - paths are relative to project_root.
2246        //   - file metadata stored as secs(u64) + subsec_nanos(u32) + size(u64) + blake3(32).
2247        //     Preserves full APFS/ext4/NTFS precision and catches mtime ties.
2248        //
2249        // V1/V2 remain readable for backward compatibility (see from_bytes).
2250        // V3/V4 load as compatible formats but are rejected on disk so snippets
2251        // and file sizes are rebuilt once.
2252        let version = SEMANTIC_INDEX_VERSION_V6;
2253        buf.push(version);
2254        buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
2255        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
2256        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
2257        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
2258        buf.extend_from_slice(fp_bytes_ref);
2259
2260        // File mtime table: count(4) + entries
2261        // V3 layout per entry: path_len(4) + path + secs(8) + subsec_nanos(4)
2262        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
2263        for (relative, path, mtime) in &file_mtimes {
2264            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
2265            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
2266            buf.extend_from_slice(&path_bytes);
2267            let duration = mtime
2268                .duration_since(SystemTime::UNIX_EPOCH)
2269                .unwrap_or_default();
2270            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
2271            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
2272            let size = self.file_sizes.get(*path).copied().unwrap_or_default();
2273            buf.extend_from_slice(&size.to_le_bytes());
2274            let hash = self
2275                .file_hashes
2276                .get(*path)
2277                .copied()
2278                .unwrap_or_else(cache_freshness::zero_hash);
2279            buf.extend_from_slice(hash.as_bytes());
2280        }
2281
2282        // Entries: each is metadata + vector
2283        for (relative, entry) in &entries {
2284            let c = &entry.chunk;
2285
2286            // File path
2287            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
2288            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
2289            buf.extend_from_slice(&file_bytes);
2290
2291            // Name
2292            let name_bytes = c.name.as_bytes();
2293            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
2294            buf.extend_from_slice(name_bytes);
2295
2296            // Kind (1 byte)
2297            buf.push(symbol_kind_to_u8(&c.kind));
2298
2299            // Lines + exported
2300            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
2301            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
2302            buf.push(c.exported as u8);
2303
2304            // Snippet
2305            let snippet_bytes = c.snippet.as_bytes();
2306            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
2307            buf.extend_from_slice(snippet_bytes);
2308
2309            // Embed text
2310            let embed_bytes = c.embed_text.as_bytes();
2311            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
2312            buf.extend_from_slice(embed_bytes);
2313
2314            // Vector (f32 array)
2315            for &val in &entry.vector {
2316                buf.extend_from_slice(&val.to_le_bytes());
2317            }
2318        }
2319
2320        buf
2321    }
2322
2323    /// Deserialize the index from bytes
2324    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2325        debug_assert!(current_canonical_root.is_absolute());
2326        let mut pos = 0;
2327
2328        if data.len() < HEADER_BYTES_V1 {
2329            return Err("data too short".to_string());
2330        }
2331
2332        let version = data[pos];
2333        pos += 1;
2334        if version != SEMANTIC_INDEX_VERSION_V1
2335            && version != SEMANTIC_INDEX_VERSION_V2
2336            && version != SEMANTIC_INDEX_VERSION_V3
2337            && version != SEMANTIC_INDEX_VERSION_V4
2338            && version != SEMANTIC_INDEX_VERSION_V5
2339            && version != SEMANTIC_INDEX_VERSION_V6
2340        {
2341            return Err(format!("unsupported version: {}", version));
2342        }
2343        // V2 and newer share the same header layout (V3/V4/V5 only differ from
2344        // V2 in the per-mtime entry layout): version(1) + dimension(4) +
2345        // entry_count(4) + fingerprint_len(4) + fingerprint bytes.
2346        if (version == SEMANTIC_INDEX_VERSION_V2
2347            || version == SEMANTIC_INDEX_VERSION_V3
2348            || version == SEMANTIC_INDEX_VERSION_V4
2349            || version == SEMANTIC_INDEX_VERSION_V5
2350            || version == SEMANTIC_INDEX_VERSION_V6)
2351            && data.len() < HEADER_BYTES_V2
2352        {
2353            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2354        }
2355
2356        let dimension = read_u32(data, &mut pos)? as usize;
2357        let entry_count = read_u32(data, &mut pos)? as usize;
2358        validate_embedding_dimension(dimension)?;
2359        if entry_count > MAX_ENTRIES {
2360            return Err(format!("too many semantic index entries: {}", entry_count));
2361        }
2362
2363        // Fingerprint handling:
2364        //   - V1: no fingerprint field at all.
2365        //   - V2: fingerprint_len + fingerprint bytes; always present (writer
2366        //     only emitted V2 when fingerprint was Some).
2367        //   - V3+: fingerprint_len always present; fingerprint_len==0 ⇒ None.
2368        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2369            || version == SEMANTIC_INDEX_VERSION_V3
2370            || version == SEMANTIC_INDEX_VERSION_V4
2371            || version == SEMANTIC_INDEX_VERSION_V5
2372            || version == SEMANTIC_INDEX_VERSION_V6;
2373        let fingerprint = if has_fingerprint_field {
2374            let fingerprint_len = read_u32(data, &mut pos)? as usize;
2375            if pos + fingerprint_len > data.len() {
2376                return Err("unexpected end of data reading fingerprint".to_string());
2377            }
2378            if fingerprint_len == 0 {
2379                None
2380            } else {
2381                let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
2382                pos += fingerprint_len;
2383                Some(
2384                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2385                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2386                )
2387            }
2388        } else {
2389            None
2390        };
2391
2392        // File mtimes
2393        let mtime_count = read_u32(data, &mut pos)? as usize;
2394        if mtime_count > MAX_ENTRIES {
2395            return Err(format!("too many semantic file mtimes: {}", mtime_count));
2396        }
2397
2398        let vector_bytes = entry_count
2399            .checked_mul(dimension)
2400            .and_then(|count| count.checked_mul(F32_BYTES))
2401            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2402        if vector_bytes > data.len().saturating_sub(pos) {
2403            return Err("semantic index vectors exceed available data".to_string());
2404        }
2405
2406        let mut file_mtimes = HashMap::with_capacity(mtime_count);
2407        let mut file_sizes = HashMap::with_capacity(mtime_count);
2408        let mut file_hashes = HashMap::with_capacity(mtime_count);
2409        for _ in 0..mtime_count {
2410            let path = read_string(data, &mut pos)?;
2411            let secs = read_u64(data, &mut pos)?;
2412            // V3+ persists subsec_nanos alongside secs so staleness checks
2413            // survive restart round-trips. V1/V2 load with 0 nanos, which
2414            // causes one rebuild on upgrade (they never matched live APFS
2415            // mtimes anyway — the bug v0.15.2 fixes). After that rebuild,
2416            // the cache is persisted as V3 and stabilises.
2417            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2418                || version == SEMANTIC_INDEX_VERSION_V4
2419                || version == SEMANTIC_INDEX_VERSION_V5
2420                || version == SEMANTIC_INDEX_VERSION_V6
2421            {
2422                read_u32(data, &mut pos)?
2423            } else {
2424                0
2425            };
2426            let size =
2427                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2428                    read_u64(data, &mut pos)?
2429                } else {
2430                    0
2431                };
2432            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2433                if pos + 32 > data.len() {
2434                    return Err("unexpected end of data reading content hash".to_string());
2435                }
2436                let mut hash_bytes = [0u8; 32];
2437                hash_bytes.copy_from_slice(&data[pos..pos + 32]);
2438                pos += 32;
2439                blake3::Hash::from_bytes(hash_bytes)
2440            } else {
2441                cache_freshness::zero_hash()
2442            };
2443            // Hardening against corrupt / maliciously crafted cache files
2444            // (v0.15.2). `Duration::new(secs, nanos)` can panic when the
2445            // nanosecond carry overflows the second counter, and
2446            // `SystemTime + Duration` can panic on carry past the platform's
2447            // upper bound. Explicit validation keeps a corrupted semantic.bin
2448            // from taking down the whole aft process.
2449            if nanos >= 1_000_000_000 {
2450                return Err(format!(
2451                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
2452                    nanos
2453                ));
2454            }
2455            let duration = std::time::Duration::new(secs, nanos);
2456            let mtime = SystemTime::UNIX_EPOCH
2457                .checked_add(duration)
2458                .ok_or_else(|| {
2459                    format!(
2460                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2461                        secs, nanos
2462                    )
2463                })?;
2464            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2465                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2466                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2467            } else {
2468                PathBuf::from(path)
2469            };
2470            file_mtimes.insert(path.clone(), mtime);
2471            file_sizes.insert(path.clone(), size);
2472            file_hashes.insert(path, content_hash);
2473        }
2474
2475        // Entries
2476        let mut entries = Vec::with_capacity(entry_count);
2477        for _ in 0..entry_count {
2478            let raw_file = PathBuf::from(read_string(data, &mut pos)?);
2479            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2480                cached_path_under_root(current_canonical_root, &raw_file)
2481                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2482            } else {
2483                raw_file
2484            };
2485            let name = read_string(data, &mut pos)?;
2486
2487            if pos >= data.len() {
2488                return Err("unexpected end of data".to_string());
2489            }
2490            let kind = u8_to_symbol_kind(data[pos]);
2491            pos += 1;
2492
2493            let start_line = read_u32(data, &mut pos)?;
2494            let end_line = read_u32(data, &mut pos)?;
2495
2496            if pos >= data.len() {
2497                return Err("unexpected end of data".to_string());
2498            }
2499            let exported = data[pos] != 0;
2500            pos += 1;
2501
2502            let snippet = read_string(data, &mut pos)?;
2503            let embed_text = read_string(data, &mut pos)?;
2504
2505            // Vector
2506            let vec_bytes = dimension
2507                .checked_mul(F32_BYTES)
2508                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2509            if pos + vec_bytes > data.len() {
2510                return Err("unexpected end of data reading vector".to_string());
2511            }
2512            let mut vector = Vec::with_capacity(dimension);
2513            for _ in 0..dimension {
2514                let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
2515                vector.push(f32::from_le_bytes(bytes));
2516                pos += 4;
2517            }
2518
2519            entries.push(EmbeddingEntry {
2520                chunk: SemanticChunk {
2521                    file,
2522                    name,
2523                    kind,
2524                    start_line,
2525                    end_line,
2526                    exported,
2527                    embed_text,
2528                    snippet,
2529                },
2530                vector,
2531            });
2532        }
2533
2534        if entries.len() != entry_count {
2535            return Err(format!(
2536                "semantic cache entry count drift: header={} decoded={}",
2537                entry_count,
2538                entries.len()
2539            ));
2540        }
2541        for entry in &entries {
2542            if !file_mtimes.contains_key(&entry.chunk.file) {
2543                return Err(format!(
2544                    "semantic cache metadata missing for entry file {}",
2545                    entry.chunk.file.display()
2546                ));
2547            }
2548        }
2549
2550        Ok(Self {
2551            entries,
2552            file_mtimes,
2553            file_sizes,
2554            file_hashes,
2555            dimension,
2556            fingerprint,
2557            project_root: current_canonical_root.to_path_buf(),
2558            deferred_files: HashSet::new(),
2559        })
2560    }
2561}
2562
2563/// Build enriched embedding text from a symbol with cAST-style context
2564fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2565    let relative = file
2566        .strip_prefix(project_root)
2567        .unwrap_or(file)
2568        .to_string_lossy();
2569
2570    let kind_label = match &symbol.kind {
2571        SymbolKind::Function => "function",
2572        SymbolKind::Class => "class",
2573        SymbolKind::Method => "method",
2574        SymbolKind::Struct => "struct",
2575        SymbolKind::Interface => "interface",
2576        SymbolKind::Enum => "enum",
2577        SymbolKind::TypeAlias => "type",
2578        SymbolKind::Variable => "variable",
2579        SymbolKind::Heading => "heading",
2580        SymbolKind::FileSummary => "file-summary",
2581    };
2582
2583    // Build: "file:relative/path kind:function name:validateAuth signature:fn validateAuth(token: &str) -> bool"
2584    let name = &symbol.name;
2585    let mut text = format!(
2586        "name:{name} file:{} kind:{} name:{name}",
2587        relative, kind_label
2588    );
2589
2590    if let Some(sig) = &symbol.signature {
2591        // Cap the signature: structured parsers (e.g. YAML/Kubernetes) pack
2592        // entire inline scripts (CronJob/Job `command:` bodies, multi-KB) into
2593        // the signature. Appending it unbounded produces a single embed_text
2594        // that overflows the embedding backend's physical batch (e.g. a
2595        // llama.cpp server's 512-token cap), aborting the whole index build
2596        // and silently degrading every search to lexical. 400 chars keeps the
2597        // identifying head of the signature without blowing the budget.
2598        text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
2599    }
2600
2601    // Add body snippet (first ~300 chars of symbol body)
2602    let lines: Vec<&str> = source.lines().collect();
2603    let start = (symbol.range.start_line as usize).min(lines.len());
2604    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2605    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2606    if start < end {
2607        let body: String = lines[start..end]
2608            .iter()
2609            .take(15) // max 15 lines
2610            .copied()
2611            .collect::<Vec<&str>>()
2612            .join("\n");
2613        let snippet = if body.len() > 300 {
2614            format!("{}...", &body[..body.floor_char_boundary(300)])
2615        } else {
2616            body
2617        };
2618        text.push_str(&format!(" body:{}", snippet));
2619    }
2620
2621    // Final defense-in-depth clamp: no single embed_text may exceed the
2622    // backend's per-input budget regardless of which field grew. Most
2623    // backends cap a physical batch around 512 tokens; ~1600 chars stays
2624    // comfortably under that for typical English/code (≈4 chars/token).
2625    truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
2626}
2627
/// Maximum number of characters allowed in one chunk's `embed_text`. Staying
/// under typical embedding-backend physical batch limits (~512 tokens) means
/// a single oversized symbol cannot abort the whole index build.
const MAX_EMBED_TEXT_CHARS: usize = 1600;

/// Return the first `max_chars` characters of `value` (the whole string when
/// it is shorter). Counts Unicode scalar values, not bytes, so the cut never
/// lands inside a multi-byte character.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    match value.char_indices().nth(max_chars) {
        // Byte offset of the first character that must be dropped.
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
2636
2637fn first_leading_doc_comment(source: &str) -> String {
2638    let lines: Vec<&str> = source.lines().collect();
2639    let Some((start, first)) = lines
2640        .iter()
2641        .enumerate()
2642        .find(|(_, line)| !line.trim().is_empty())
2643    else {
2644        return String::new();
2645    };
2646
2647    let trimmed = first.trim_start();
2648    if trimmed.starts_with("/**") {
2649        let mut comment = Vec::new();
2650        for line in lines.iter().skip(start) {
2651            comment.push(*line);
2652            if line.contains("*/") {
2653                break;
2654            }
2655        }
2656        return truncate_chars(&comment.join("\n"), 200);
2657    }
2658
2659    if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2660        let comment = lines
2661            .iter()
2662            .skip(start)
2663            .take_while(|line| {
2664                let trimmed = line.trim_start();
2665                trimmed.starts_with("///") || trimmed.starts_with("//!")
2666            })
2667            .copied()
2668            .collect::<Vec<_>>()
2669            .join("\n");
2670        return truncate_chars(&comment, 200);
2671    }
2672
2673    String::new()
2674}
2675
2676pub fn build_file_summary_chunk(
2677    file: &Path,
2678    project_root: &Path,
2679    source: &str,
2680    top_exports: &[&str],
2681    top_export_signatures: &[Option<&str>],
2682) -> SemanticChunk {
2683    let relative = file.strip_prefix(project_root).unwrap_or(file);
2684    let rel_path = relative.to_string_lossy();
2685    let parent_dir = relative
2686        .parent()
2687        .map(|parent| parent.to_string_lossy().to_string())
2688        .unwrap_or_default();
2689    let name = file
2690        .file_stem()
2691        .map(|stem| stem.to_string_lossy().to_string())
2692        .unwrap_or_default();
2693    let doc = first_leading_doc_comment(source);
2694    let exports = top_exports
2695        .iter()
2696        .take(5)
2697        .copied()
2698        .collect::<Vec<_>>()
2699        .join(",");
2700    let snippet = if doc.is_empty() {
2701        top_export_signatures
2702            .first()
2703            .and_then(|signature| signature.as_deref())
2704            .map(|signature| truncate_chars(signature, 200))
2705            .unwrap_or_default()
2706    } else {
2707        doc.clone()
2708    };
2709
2710    SemanticChunk {
2711        file: file.to_path_buf(),
2712        name,
2713        kind: SymbolKind::FileSummary,
2714        start_line: 0,
2715        end_line: 0,
2716        exported: false,
2717        embed_text: truncate_chars(
2718            &format!(
2719                "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2720                file.file_stem()
2721                    .map(|stem| stem.to_string_lossy().to_string())
2722                    .unwrap_or_default()
2723            ),
2724            MAX_EMBED_TEXT_CHARS,
2725        ),
2726        snippet,
2727    }
2728}
2729
2730fn parser_for(
2731    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2732    lang: crate::parser::LangId,
2733) -> Result<&mut Parser, String> {
2734    use std::collections::hash_map::Entry;
2735
2736    match parsers.entry(lang) {
2737        Entry::Occupied(entry) => Ok(entry.into_mut()),
2738        Entry::Vacant(entry) => {
2739            let grammar = grammar_for(lang);
2740            let mut parser = Parser::new();
2741            parser
2742                .set_language(&grammar)
2743                .map_err(|error| error.to_string())?;
2744            Ok(entry.insert(parser))
2745        }
2746    }
2747}
2748
/// Whether `path` has a file extension that is eligible for semantic
/// indexing. The comparison is case-sensitive, and paths without a (UTF-8)
/// extension are never eligible.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
2781
2782fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2783    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2784    let mtime = metadata.modified().map_err(|error| error.to_string())?;
2785    let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2786        .map_err(|error| error.to_string())?
2787        .unwrap_or_else(cache_freshness::zero_hash);
2788    Ok(IndexedFileMetadata {
2789        mtime,
2790        size: metadata.len(),
2791        content_hash,
2792    })
2793}
2794
/// Canonicalize `path`, tolerating a final component that no longer exists.
///
/// When the path itself cannot be canonicalized, the parent directory is
/// canonicalized instead and the file name is re-attached. If that fails too
/// (or the path has no parent / file name), `path` is returned unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| match (path.parent(), path.file_name()) {
        (Some(parent), Some(leaf)) => match fs::canonicalize(parent) {
            Ok(resolved_parent) => resolved_parent.join(leaf),
            Err(_) => path.to_path_buf(),
        },
        _ => path.to_path_buf(),
    })
}
2811
2812fn collect_file_chunks(
2813    project_root: &Path,
2814    file: &Path,
2815    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2816) -> Result<Vec<SemanticChunk>, String> {
2817    if !is_semantic_indexed_extension(file) {
2818        return Err("unsupported file extension".to_string());
2819    }
2820    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2821    let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2822    let tree = parser_for(parsers, lang)?
2823        .parse(&source, None)
2824        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2825    let symbols =
2826        extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2827
2828    Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2829}
2830
2831/// Build a display snippet from a symbol's source
2832fn build_snippet(symbol: &Symbol, source: &str) -> String {
2833    let lines: Vec<&str> = source.lines().collect();
2834    let start = (symbol.range.start_line as usize).min(lines.len());
2835    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2836    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2837    if start < end {
2838        let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2839        let mut snippet = snippet_lines.join("\n");
2840        if end - start > 5 {
2841            snippet.push_str("\n  ...");
2842        }
2843        if snippet.len() > 300 {
2844            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2845        }
2846        snippet
2847    } else {
2848        String::new()
2849    }
2850}
2851
2852/// Convert symbols to semantic chunks with enriched context
2853fn symbols_to_chunks(
2854    file: &Path,
2855    symbols: &[Symbol],
2856    source: &str,
2857    project_root: &Path,
2858) -> Vec<SemanticChunk> {
2859    let mut chunks = Vec::new();
2860    let top_exports_with_signatures = symbols
2861        .iter()
2862        .filter(|symbol| {
2863            symbol.exported
2864                && symbol.parent.is_none()
2865                && !matches!(symbol.kind, SymbolKind::Heading)
2866        })
2867        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2868        .collect::<Vec<_>>();
2869
2870    let has_only_headings = !symbols.is_empty()
2871        && symbols
2872            .iter()
2873            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2874    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2875        let top_exports = top_exports_with_signatures
2876            .iter()
2877            .map(|(name, _)| *name)
2878            .collect::<Vec<_>>();
2879        let top_export_signatures = top_exports_with_signatures
2880            .iter()
2881            .map(|(_, signature)| *signature)
2882            .collect::<Vec<_>>();
2883        chunks.push(build_file_summary_chunk(
2884            file,
2885            project_root,
2886            source,
2887            &top_exports,
2888            &top_export_signatures,
2889        ));
2890    }
2891
2892    for symbol in symbols {
2893        // Skip Markdown / HTML heading chunks: empirically they dominate result
2894        // lists even for code-shaped queries because heading prose embeds well.
2895        // Agents querying for code lose the actual matches under doc noise.
2896        // README/docs queries are still served by grep on the same files.
2897        if matches!(symbol.kind, SymbolKind::Heading) {
2898            continue;
2899        }
2900
2901        // Skip very small symbols (single-line variables, etc.)
2902        let line_count = symbol
2903            .range
2904            .end_line
2905            .saturating_sub(symbol.range.start_line)
2906            + 1;
2907        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2908            continue;
2909        }
2910
2911        let embed_text = build_embed_text(symbol, source, file, project_root);
2912        let snippet = build_snippet(symbol, source);
2913
2914        chunks.push(SemanticChunk {
2915            file: file.to_path_buf(),
2916            name: symbol.name.clone(),
2917            kind: symbol.kind.clone(),
2918            start_line: symbol.range.start_line,
2919            end_line: symbol.range.end_line,
2920            exported: symbol.exported,
2921            embed_text,
2922            snippet,
2923        });
2924
2925        // Note: Nested symbols are handled separately by the outline system
2926        // Each symbol is indexed individually
2927    }
2928
2929    chunks
2930}
2931
2932/// Cosine similarity between two vectors
2933fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
2934    if a.len() != b.len() {
2935        return 0.0;
2936    }
2937
2938    let mut dot = 0.0f32;
2939    let mut norm_a = 0.0f32;
2940    let mut norm_b = 0.0f32;
2941
2942    for i in 0..a.len() {
2943        dot += a[i] * b[i];
2944        norm_a += a[i] * a[i];
2945        norm_b += b[i] * b[i];
2946    }
2947
2948    let denom = norm_a.sqrt() * norm_b.sqrt();
2949    if denom == 0.0 {
2950        0.0
2951    } else {
2952        dot / denom
2953    }
2954}
2955
2956// Serialization helpers
2957fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2958    match kind {
2959        SymbolKind::Function => 0,
2960        SymbolKind::Class => 1,
2961        SymbolKind::Method => 2,
2962        SymbolKind::Struct => 3,
2963        SymbolKind::Interface => 4,
2964        SymbolKind::Enum => 5,
2965        SymbolKind::TypeAlias => 6,
2966        SymbolKind::Variable => 7,
2967        SymbolKind::Heading => 8,
2968        SymbolKind::FileSummary => 9,
2969    }
2970}
2971
2972fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2973    match v {
2974        0 => SymbolKind::Function,
2975        1 => SymbolKind::Class,
2976        2 => SymbolKind::Method,
2977        3 => SymbolKind::Struct,
2978        4 => SymbolKind::Interface,
2979        5 => SymbolKind::Enum,
2980        6 => SymbolKind::TypeAlias,
2981        7 => SymbolKind::Variable,
2982        8 => SymbolKind::Heading,
2983        9 => SymbolKind::FileSummary,
2984        _ => SymbolKind::Heading,
2985    }
2986}
2987
/// Borrow the next `len` bytes starting at `*pos` and advance `*pos` past
/// them. Returns `None`, leaving `*pos` untouched, when the range would run
/// past the end of `data` or when `*pos + len` overflows `usize`.
fn take_bytes<'a>(data: &'a [u8], pos: &mut usize, len: usize) -> Option<&'a [u8]> {
    let start = *pos;
    let end = start.checked_add(len)?;
    let bytes = data.get(start..end)?;
    *pos = end;
    Some(bytes)
}

/// Read a little-endian `u32` at `*pos`, advancing `*pos` by 4 on success.
///
/// # Errors
/// Returns an error (and leaves `*pos` unchanged) when fewer than 4 bytes
/// remain; an out-of-range or overflowing `*pos` is an error, not a panic.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    let bytes = take_bytes(data, pos, 4)
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;
    let mut le = [0u8; 4];
    le.copy_from_slice(bytes);
    Ok(u32::from_le_bytes(le))
}

/// Read a little-endian `u64` at `*pos`, advancing `*pos` by 8 on success.
///
/// # Errors
/// Returns an error (and leaves `*pos` unchanged) when fewer than 8 bytes
/// remain; an out-of-range or overflowing `*pos` is an error, not a panic.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    let bytes = take_bytes(data, pos, 8)
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;
    let mut le = [0u8; 8];
    le.copy_from_slice(bytes);
    Ok(u64::from_le_bytes(le))
}

/// Read a string stored as a little-endian `u32` byte length followed by
/// that many bytes. Invalid UTF-8 is decoded lossily.
///
/// # Errors
/// Returns an error when the length prefix or the payload is truncated. The
/// length is added with overflow checking, so a huge length in a corrupt
/// file cannot wrap around the bounds check on 32-bit targets. When only the
/// payload is missing, `*pos` is left just past the length prefix.
fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
    let len = read_u32(data, pos)? as usize;
    let bytes = take_bytes(data, pos, len)
        .ok_or_else(|| "unexpected end of data reading string".to_string())?;
    Ok(String::from_utf8_lossy(bytes).into_owned())
}
3015
3016#[cfg(test)]
3017mod tests {
3018    use super::*;
3019    use crate::config::{SemanticBackend, SemanticBackendConfig};
3020    use crate::parser::FileParser;
3021    use std::io::{Read, Write};
3022    use std::net::TcpListener;
3023    use std::thread;
3024
3025    #[test]
3026    fn semantic_index_includes_php_inc_and_scss_extensions() {
3027        for file in ["partial.inc", "index.php", "styles.scss"] {
3028            assert!(
3029                is_semantic_indexed_extension(Path::new(file)),
3030                "{file} should be semantic-index eligible"
3031            );
3032        }
3033    }
3034
3035    #[test]
3036    fn transient_marker_round_trips_and_classifies() {
3037        // A marked transient error is recognized and the marker is stripped for
3038        // display, leaving a clean message.
3039        let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
3040        assert!(embedding_failure_is_transient(&marked));
3041        let clean = strip_transient_embedding_marker(&marked);
3042        assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
3043        assert!(clean.starts_with("openai compatible request failed:"));
3044
3045        // Permanent errors (HTTP 4xx, dimension mismatch) carry no marker and
3046        // are not classified transient — they must fail fast.
3047        for permanent in [
3048            "openai compatible request failed (HTTP 401): Unauthorized",
3049            "embedding dimension mismatch: index has 384, model returned 768",
3050            "too many files (>20000) for semantic indexing (max 20000)",
3051        ] {
3052            assert!(
3053                !embedding_failure_is_transient(permanent),
3054                "{permanent:?} must not be transient"
3055            );
3056            // Stripping a marker-free string is a no-op.
3057            assert_eq!(strip_transient_embedding_marker(permanent), permanent);
3058        }
3059    }
3060
3061    #[test]
3062    fn send_error_transience_separates_connect_timeout_from_4xx() {
3063        // 5xx / 429 are transient; other client errors are not.
3064        assert!(is_retryable_embedding_status(
3065            reqwest::StatusCode::INTERNAL_SERVER_ERROR
3066        ));
3067        assert!(is_retryable_embedding_status(
3068            reqwest::StatusCode::TOO_MANY_REQUESTS
3069        ));
3070        assert!(!is_retryable_embedding_status(
3071            reqwest::StatusCode::UNAUTHORIZED
3072        ));
3073        assert!(!is_retryable_embedding_status(
3074            reqwest::StatusCode::BAD_REQUEST
3075        ));
3076    }
3077
/// Checks `embedding_response_body_is_transient` against sample response
/// bodies: model-loading messages must match, misconfiguration messages
/// must not.
#[test]
fn local_backend_model_loading_body_is_transient() {
    // LM Studio / Ollama return a 4xx with a loading/unloaded message while
    // the model swaps; these must classify transient so the build self-heals.
    let loading_bodies = [
        r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
        r#"{"error":"model is loading, please wait"}"#,
        r#"{"error":"Model not loaded"}"#,
        "Loading model into memory",
    ];
    for body in loading_bodies {
        let transient = embedding_response_body_is_transient(body);
        assert!(transient, "{body:?} should be body-transient");
    }

    // A genuine 4xx misconfiguration body must NOT be treated as transient.
    let misconfigured_bodies = [
        r#"{"error":"invalid api key"}"#,
        r#"{"error":"model 'foo' not found"}"#,
        "Bad Request: unknown field",
    ];
    for body in misconfigured_bodies {
        let transient = embedding_response_body_is_transient(body);
        assert!(!transient, "{body:?} must not be body-transient");
    }
}
3106
/// Spawns a one-shot HTTP/1.1 server on an ephemeral localhost port.
///
/// The server thread accepts a single connection, reads one request (the
/// header block plus `Content-Length` bytes of body), calls
/// `handler(request_line, path, body)` and answers `200 OK` with the
/// returned string as an `application/json` body.
///
/// Returns the base URL (`http://<addr>`) and the server thread's join
/// handle; joining it surfaces any panic raised inside `handler`.
///
/// # Panics
///
/// The spawned thread panics if binding, accepting, reading or writing
/// fails, or if the request has no header terminator, request line or path.
fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
where
    F: Fn(String, String, String) -> String + Send + 'static,
{
    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept request");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end = None;
        let mut content_length = 0usize;
        loop {
            let n = stream.read(&mut chunk).expect("read request");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    let headers = String::from_utf8_lossy(&buf[..pos + 4]);
                    for line in headers.lines() {
                        // HTTP header names are case-insensitive and clients
                        // may send `content-length` in lowercase, so the
                        // name is compared without regard to ASCII case.
                        if let Some((name, value)) = line.split_once(':') {
                            if name.trim().eq_ignore_ascii_case("content-length") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
            }
            if let Some(end) = header_end {
                // Stop once the whole announced body has arrived.
                if buf.len() >= end + content_length {
                    break;
                }
            }
        }

        let end = header_end.expect("header terminator");
        // Clamp so a peer that closes before sending the full body yields a
        // truncated body instead of an out-of-range slice panic.
        let body_end = buf.len().min(end + content_length);
        let request = String::from_utf8_lossy(&buf[..end]).to_string();
        let body = String::from_utf8_lossy(&buf[end..body_end]).to_string();
        let mut lines = request.lines();
        let request_line = lines.next().expect("request line").to_string();
        let path = request_line
            .split_whitespace()
            .nth(1)
            .expect("request path")
            .to_string();
        let response_body = handler(request_line, path, body);
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            response_body.len(),
            response_body
        );
        stream
            .write_all(response.as_bytes())
            .expect("write response");
    });

    (format!("http://{}", addr), handle)
}
3166
/// Stub embedder: returns one fixed `[1.0, 0.0, 0.0]` vector per input
/// text, ignoring the text contents. Never returns `Err`.
fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
    let unit_x = vec![1.0, 0.0, 0.0];
    Ok(vec![unit_x; texts.len()])
}
3170
/// Writes a minimal Rust source file to `path` containing one public
/// function named `function_name` that returns `true`.
///
/// Panics if the write fails.
fn write_rust_file(path: &Path, function_name: &str) {
    let source = format!("pub fn {function_name}() -> bool {{\n    true\n}}\n");
    fs::write(path, source).unwrap();
}
3178
3179    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3180        let mut embed = test_vector_for_texts;
3181        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3182    }
3183
/// Returns the process's current working directory, used by tests as a
/// project root. Panics if the directory cannot be determined.
fn test_project_root() -> PathBuf {
    let cwd = std::env::current_dir();
    cwd.unwrap()
}
3187
3188    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3189        index.file_mtimes.insert(file.to_path_buf(), mtime);
3190        index.file_sizes.insert(file.to_path_buf(), size);
3191        index
3192            .file_hashes
3193            .insert(file.to_path_buf(), cache_freshness::zero_hash());
3194    }
3195
/// An index whose only entry and metadata refer to `<project>/../outside.rs`
/// must come back empty after a `to_bytes` / `from_bytes` round trip.
#[test]
fn semantic_cache_serialization_skips_paths_outside_project_root() {
    let dir = tempfile::tempdir().expect("create temp dir");
    let project = fs::canonicalize(dir.path()).expect("canonical project");
    let outside = project.join("..").join("outside.rs");

    let outside_entry = EmbeddingEntry {
        chunk: SemanticChunk {
            file: outside.clone(),
            name: "outside".to_string(),
            kind: SymbolKind::Function,
            start_line: 0,
            end_line: 0,
            exported: false,
            embed_text: "outside".to_string(),
            snippet: "outside".to_string(),
        },
        vector: vec![1.0, 0.0, 0.0],
    };

    let mut index = SemanticIndex::new(project.clone(), 3);
    index
        .file_mtimes
        .insert(outside.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(outside.clone(), 1);
    index
        .file_hashes
        .insert(outside, cache_freshness::zero_hash());
    index.entries.push(outside_entry);

    let serialized = index.to_bytes();
    let loaded = SemanticIndex::from_bytes(&serialized, &project).expect("load serialized index");

    // Neither the entry nor its mtime record may survive the round trip.
    assert_eq!(loaded.entries.len(), 0);
    assert!(loaded.file_mtimes.is_empty());
}
3228
/// Two equal vectors must score 1.0 (within 0.001).
#[test]
fn test_cosine_similarity_identical() {
    let lhs = vec![1.0, 0.0, 0.0];
    let rhs = vec![1.0, 0.0, 0.0];
    let similarity = cosine_similarity(&lhs, &rhs);
    assert!((similarity - 1.0).abs() < 0.001);
}
3235
/// Two orthogonal vectors must score 0.0 (within 0.001).
#[test]
fn test_cosine_similarity_orthogonal() {
    let x_axis = vec![1.0, 0.0, 0.0];
    let y_axis = vec![0.0, 1.0, 0.0];
    let similarity = cosine_similarity(&x_axis, &y_axis);
    assert!(similarity.abs() < 0.001);
}
3242
/// Two opposite vectors must score -1.0 (within 0.001).
#[test]
fn test_cosine_similarity_opposite() {
    let forward = vec![1.0, 0.0, 0.0];
    let backward = vec![-1.0, 0.0, 0.0];
    let similarity = cosine_similarity(&forward, &backward);
    assert!((similarity + 1.0).abs() < 0.001);
}
3249
/// A single-entry index with a fingerprint must survive `to_bytes` /
/// `from_bytes` with its entry name, vector, dimension, and backend/model
/// labels intact.
#[test]
fn test_serialization_roundtrip() {
    let project_root = test_project_root();
    let file = project_root.join("src/main.rs");

    let entry = EmbeddingEntry {
        chunk: SemanticChunk {
            file: file.clone(),
            name: "handle_request".to_string(),
            kind: SymbolKind::Function,
            start_line: 10,
            end_line: 25,
            exported: true,
            embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
            snippet: "fn handle_request() {\n  // ...\n}".to_string(),
        },
        vector: vec![0.1, 0.2, 0.3, 0.4],
    };
    let fingerprint = SemanticIndexFingerprint {
        backend: "fastembed".to_string(),
        model: "all-MiniLM-L6-v2".to_string(),
        base_url: FALLBACK_BACKEND.to_string(),
        dimension: 4,
        chunking_version: default_chunking_version(),
    };

    let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
    index.entries.push(entry);
    // The index is created with the default dimension; shrink it to match
    // the 4-component test vector.
    index.dimension = 4;
    index
        .file_mtimes
        .insert(file.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(file, 0);
    index.set_fingerprint(fingerprint);

    let serialized = index.to_bytes();
    let restored = SemanticIndex::from_bytes(&serialized, &project_root).unwrap();

    assert_eq!(restored.entries.len(), 1);
    let restored_entry = &restored.entries[0];
    assert_eq!(restored_entry.chunk.name, "handle_request");
    assert_eq!(restored_entry.vector, vec![0.1, 0.2, 0.3, 0.4]);
    assert_eq!(restored.dimension, 4);
    assert_eq!(restored.backend_label(), Some("fastembed"));
    assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
}
3291
/// Pins the `SymbolKind` <-> `u8` mapping in both directions for each
/// listed variant, including `FileSummary` at 9.
#[test]
fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
    let expected_encoding = [
        (SymbolKind::Function, 0),
        (SymbolKind::Class, 1),
        (SymbolKind::Method, 2),
        (SymbolKind::Struct, 3),
        (SymbolKind::Interface, 4),
        (SymbolKind::Enum, 5),
        (SymbolKind::TypeAlias, 6),
        (SymbolKind::Variable, 7),
        (SymbolKind::Heading, 8),
        (SymbolKind::FileSummary, 9),
    ];

    expected_encoding.iter().for_each(|(kind, encoded)| {
        // Encode, then decode, and expect to land on the same pair.
        assert_eq!(symbol_kind_to_u8(kind), *encoded);
        assert_eq!(u8_to_symbol_kind(*encoded), *kind);
    });
}
3312
/// With three entries on orthogonal axes, a query leaning towards the first
/// axis must return exactly `k = 2` results with "auth" ranked first.
#[test]
fn test_search_top_k() {
    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.dimension = 3;

    // One entry per axis, so the entry vectors are mutually orthogonal.
    let names = ["auth", "database", "handler"];
    for (axis, name) in names.iter().enumerate() {
        let mut axis_vector = vec![0.0f32; 3];
        axis_vector[axis] = 1.0;

        let chunk = SemanticChunk {
            file: PathBuf::from("/src/lib.rs"),
            name: name.to_string(),
            kind: SymbolKind::Function,
            start_line: (axis * 10 + 1) as u32,
            end_line: (axis * 10 + 5) as u32,
            exported: true,
            embed_text: format!("kind:function name:{}", name),
            snippet: format!("fn {}() {{}}", name),
        };
        index.entries.push(EmbeddingEntry {
            chunk,
            vector: axis_vector,
        });
    }

    // The query is dominated by its first component, i.e. the "auth" axis.
    let query = vec![0.9, 0.1, 0.0];
    let hits = index.search(&query, 2);

    assert_eq!(hits.len(), 2);
    assert_eq!(hits[0].name, "auth");
    assert!(hits[0].score > hits[1].score);
}
3345
/// Searching an index with no entries must return no results.
#[test]
fn test_empty_index_search() {
    let empty = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let hits = empty.search(&[0.1, 0.2, 0.3], 10);
    assert!(hits.is_empty());
}
3352
/// A symbol whose range starts and ends on line 0 must yield that whole
/// source line (without the trailing newline) as its snippet.
#[test]
fn single_line_symbol_builds_non_empty_snippet() {
    let source = "export const answer = 42;\n";

    // The range stays on line 0, columns 0..24.
    let one_line_range = crate::symbols::Range {
        start_line: 0,
        start_col: 0,
        end_line: 0,
        end_col: 24,
    };
    let symbol = Symbol {
        name: "answer".to_string(),
        kind: SymbolKind::Variable,
        range: one_line_range,
        signature: Some("const answer = 42".to_string()),
        scope_chain: Vec::new(),
        exported: true,
        parent: None,
    };

    assert_eq!(build_snippet(&symbol, source), "export const answer = 42;");
}
3375
/// Chunks produced by `collect_file_chunks` for this crate's
/// `src/semantic_index.rs` must fingerprint identically to chunks built via
/// `FileParser::extract_symbols` + `symbols_to_chunks`.
#[test]
fn optimized_file_chunk_collection_matches_file_parser_path() {
    let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let file = project_root.join("src/semantic_index.rs");
    let source = std::fs::read_to_string(&file).unwrap();

    // Reference path: FileParser symbols turned into chunks.
    let mut reference_parser = FileParser::new();
    let reference_symbols = reference_parser.extract_symbols(&file).unwrap();
    let reference_chunks = symbols_to_chunks(&file, &reference_symbols, &source, &project_root);

    // Path under test: collect_file_chunks with a fresh parser map.
    let mut parser_cache = HashMap::new();
    let collected_chunks = collect_file_chunks(&project_root, &file, &mut parser_cache).unwrap();

    let actual = chunk_fingerprint(&collected_chunks);
    let expected = chunk_fingerprint(&reference_chunks);
    assert_eq!(actual, expected);
}
3394
3395    fn chunk_fingerprint(
3396        chunks: &[SemanticChunk],
3397    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
3398        chunks
3399            .iter()
3400            .map(|chunk| {
3401                (
3402                    chunk.name.clone(),
3403                    chunk.kind.clone(),
3404                    chunk.start_line,
3405                    chunk.end_line,
3406                    chunk.exported,
3407                    chunk.embed_text.clone(),
3408                    chunk.snippet.clone(),
3409                )
3410            })
3411            .collect()
3412    }
3413
/// A serialized header announcing `MAX_DIMENSION + 1` must be rejected by
/// `SemanticIndex::from_bytes`.
#[test]
fn rejects_oversized_dimension_during_deserialization() {
    let oversized_dimension = (MAX_DIMENSION as u32) + 1;

    // Leading byte 1 (presumably the V1 version tag - confirm against
    // `from_bytes`), then three little-endian u32 fields.
    let mut bytes = vec![1u8];
    bytes.extend_from_slice(&oversized_dimension.to_le_bytes());
    for _ in 0..2 {
        bytes.extend_from_slice(&0u32.to_le_bytes());
    }

    let parsed = SemanticIndex::from_bytes(&bytes, &test_project_root());
    assert!(parsed.is_err());
}
3424
/// A serialized header announcing `MAX_ENTRIES + 1` entries must be
/// rejected by `SemanticIndex::from_bytes`.
#[test]
fn rejects_oversized_entry_count_during_deserialization() {
    let dimension = DEFAULT_DIMENSION as u32;
    let oversized_count = (MAX_ENTRIES as u32) + 1;

    // Leading byte 1 (presumably the V1 version tag - confirm against
    // `from_bytes`), then three little-endian u32 fields.
    let mut bytes = vec![1u8];
    for field in [dimension, oversized_count, 0u32] {
        bytes.extend_from_slice(&field.to_le_bytes());
    }

    let parsed = SemanticIndex::from_bytes(&bytes, &test_project_root());
    assert!(parsed.is_err());
}
3435
/// `invalidate_file` must drop the file's entries together with its
/// recorded mtime and size.
#[test]
fn invalidate_file_removes_entries_and_mtime() {
    let target = PathBuf::from("/src/main.rs");

    let main_entry = EmbeddingEntry {
        chunk: SemanticChunk {
            file: target.clone(),
            name: "main".to_string(),
            kind: SymbolKind::Function,
            start_line: 0,
            end_line: 1,
            exported: false,
            embed_text: "main".to_string(),
            snippet: "fn main() {}".to_string(),
        },
        vector: vec![1.0; DEFAULT_DIMENSION],
    };

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    index.entries.push(main_entry);
    index
        .file_mtimes
        .insert(target.clone(), SystemTime::UNIX_EPOCH);
    index.file_sizes.insert(target.clone(), 0);

    index.invalidate_file(&target);

    assert!(index.entries.is_empty());
    let mtime_kept = index.file_mtimes.contains_key(&target);
    let size_kept = index.file_sizes.contains_key(&target);
    assert!(!mtime_kept);
    assert!(!size_kept);
}
3464
/// An indexed file whose cached metadata is stale and which has since been
/// removed from disk must be reported as deleted, with its entries and all
/// cached metadata purged.
#[test]
fn refresh_missing_changed_file_is_purged_after_collect() {
    let temp = tempfile::tempdir().unwrap();
    let project_root = temp.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "vanished_symbol");
    let files = std::slice::from_ref(&file);

    let mut index = build_test_index(project_root, files);

    // Make the cached metadata disagree with what was indexed, then remove
    // the file so the refresh finds nothing on disk.
    let indexed_size = index.file_sizes.get(&file).copied().unwrap();
    set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, indexed_size + 1);
    fs::remove_file(&file).unwrap();

    let mut embed = test_vector_for_texts;
    let mut ignore_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(project_root, files, &mut embed, 8, &mut ignore_progress)
        .unwrap();

    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 1);
    assert!(index.entries.is_empty());
    assert!(!index.file_mtimes.contains_key(&file));
    assert!(!index.file_sizes.contains_key(&file));
    assert!(!index.file_hashes.contains_key(&file));
}
3498
/// When the indexed path still exists but is now a directory rather than a
/// file, the refresh must leave the cached entries and the (stale) cached
/// metadata untouched and report no changes.
#[test]
fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
    let temp = tempfile::tempdir().unwrap();
    let project_root = temp.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "kept_symbol");
    let files = std::slice::from_ref(&file);

    let mut index = build_test_index(project_root, files);
    let entries_before = index.entries.len();
    let indexed_mtime = *index.file_mtimes.get(&file).unwrap();
    let indexed_size = *index.file_sizes.get(&file).unwrap();

    // Mark the cache as stale, then swap the file for a directory at the
    // same path.
    let stale_mtime = SystemTime::UNIX_EPOCH;
    let stale_size = indexed_size + 1;
    set_file_metadata(&mut index, &file, stale_mtime, stale_size);
    fs::remove_file(&file).unwrap();
    fs::create_dir(&file).unwrap();

    let mut embed = test_vector_for_texts;
    let mut ignore_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(project_root, files, &mut embed, 8, &mut ignore_progress)
        .unwrap();

    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);

    assert_eq!(index.entries.len(), entries_before);
    let kept = index
        .entries
        .iter()
        .any(|entry| entry.chunk.name == "kept_symbol");
    assert!(kept);

    // The stale metadata written above is still what the index holds.
    assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
    assert_ne!(index.file_mtimes.get(&file), Some(&indexed_mtime));
    assert_eq!(index.file_sizes.get(&file), Some(&stale_size));
}
3541
/// Refreshing with a path that was never indexed and does not exist on disk
/// must record no metadata, add no entries, and report no changes.
#[test]
fn refresh_never_indexed_file_error_does_not_record_mtime() {
    let temp = tempfile::tempdir().unwrap();
    let project_root = temp.path();
    // Only the parent directory is created; the file itself never exists.
    let missing = project_root.join("src/missing.rs");
    fs::create_dir_all(missing.parent().unwrap()).unwrap();

    let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
    let mut embed = test_vector_for_texts;
    let mut ignore_progress = |_done: usize, _total: usize| {};
    let files = std::slice::from_ref(&missing);
    let summary = index
        .refresh_stale_files(project_root, files, &mut embed, 8, &mut ignore_progress)
        .unwrap();

    assert_eq!(summary.added, 0);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.deleted, 0);
    assert!(!index.file_mtimes.contains_key(&missing));
    assert!(!index.file_sizes.contains_key(&missing));
    assert!(index.entries.is_empty());
}
3569
/// Refreshing with one already-indexed file plus one new file must report
/// `added == 1`, process both files, and index the new file.
#[test]
fn refresh_reports_added_for_new_files() {
    let temp = tempfile::tempdir().unwrap();
    let project_root = temp.path();
    let existing = project_root.join("src/lib.rs");
    let added = project_root.join("src/new.rs");
    fs::create_dir_all(existing.parent().unwrap()).unwrap();
    write_rust_file(&existing, "existing_symbol");
    write_rust_file(&added, "added_symbol");

    // Only `existing` goes into the initial build.
    let mut index = build_test_index(project_root, std::slice::from_ref(&existing));

    let current_files = [existing.clone(), added.clone()];
    let mut embed = test_vector_for_texts;
    let mut ignore_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            &current_files,
            &mut embed,
            8,
            &mut ignore_progress,
        )
        .unwrap();

    assert_eq!(summary.added, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 2);
    assert!(index.file_mtimes.contains_key(&added));
    let added_indexed = index.entries.iter().any(|entry| entry.chunk.file == added);
    assert!(added_indexed);
}
3600
/// Refreshing with an empty file list after the only indexed file was
/// removed must report `deleted == 1` and leave the index empty.
#[test]
fn refresh_reports_deleted_for_removed_files() {
    let temp = tempfile::tempdir().unwrap();
    let project_root = temp.path();
    let deleted = project_root.join("src/deleted.rs");
    fs::create_dir_all(deleted.parent().unwrap()).unwrap();
    write_rust_file(&deleted, "deleted_symbol");

    let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
    fs::remove_file(&deleted).unwrap();

    // The refresh is given no current files at all.
    let no_files: &[PathBuf] = &[];
    let mut embed = test_vector_for_texts;
    let mut ignore_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(project_root, no_files, &mut embed, 8, &mut ignore_progress)
        .unwrap();

    assert_eq!(summary.deleted, 1);
    assert_eq!(summary.changed, 0);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.total_processed, 1);
    assert!(!index.file_mtimes.contains_key(&deleted));
    assert!(index.entries.is_empty());
}
3625
/// After the indexed file is rewritten with a different function name (and
/// its cached metadata reset), the refresh must report `changed == 1` and
/// replace the old symbol's entries with the new symbol's.
#[test]
fn refresh_reports_changed_for_modified_files() {
    let temp = tempfile::tempdir().unwrap();
    let project_root = temp.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "old_symbol");
    let files = std::slice::from_ref(&file);

    let mut index = build_test_index(project_root, files);
    // Reset the cached mtime/size/hash so they no longer match the file,
    // then rewrite the file with a new symbol.
    set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
    write_rust_file(&file, "new_symbol");

    let mut embed = test_vector_for_texts;
    let mut ignore_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(project_root, files, &mut embed, 8, &mut ignore_progress)
        .unwrap();

    assert_eq!(summary.changed, 1);
    assert_eq!(summary.added, 0);
    assert_eq!(summary.deleted, 0);
    assert_eq!(summary.total_processed, 1);

    let has_symbol = |wanted: &str| index.entries.iter().any(|entry| entry.chunk.name == wanted);
    assert!(has_symbol("new_symbol"));
    assert!(!has_symbol("old_symbol"));
}
3663
/// Refreshing right after a build, with nothing modified, must be a no-op:
/// the summary is a no-op, the embedder is never called, and the entry
/// count is unchanged.
#[test]
fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
    let temp = tempfile::tempdir().unwrap();
    let project_root = temp.path();
    let file = project_root.join("src/lib.rs");
    fs::create_dir_all(file.parent().unwrap()).unwrap();
    write_rust_file(&file, "clean_symbol");
    let files = std::slice::from_ref(&file);

    let mut index = build_test_index(project_root, files);
    let entries_before = index.entries.len();

    // Wrap the stub embedder so any call to it is observable.
    let mut embed_called = false;
    let mut spying_embed = |texts: Vec<String>| {
        embed_called = true;
        test_vector_for_texts(texts)
    };
    let mut ignore_progress = |_done: usize, _total: usize| {};
    let summary = index
        .refresh_stale_files(
            project_root,
            files,
            &mut spying_embed,
            8,
            &mut ignore_progress,
        )
        .unwrap();

    assert!(summary.is_noop());
    assert_eq!(summary.total_processed, 1);
    assert!(!embed_called);
    assert_eq!(index.entries.len(), entries_before);
}
3695
/// A dlopen failure message naming the ONNX Runtime shared library must be
/// recognised by `is_onnx_runtime_unavailable`.
#[test]
fn detects_missing_onnx_runtime_from_dynamic_load_error() {
    let dlopen_failure = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
    let unavailable = is_onnx_runtime_unavailable(dlopen_failure);
    assert!(unavailable);
}
3702
/// For a missing-runtime dlopen failure, `format_embedding_init_error` must
/// lead with the install hint and still include the original error.
#[test]
fn formats_missing_onnx_runtime_with_install_hint() {
    let dlopen_failure =
        "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file";
    let formatted = format_embedding_init_error(dlopen_failure);

    assert!(formatted.starts_with("ONNX Runtime not found. Install via:"));
    assert!(formatted.contains("Original error:"));
}
3712
/// End-to-end check of the OpenAI-compatible backend against the mock
/// server: the request must be a `POST` to `/v1/embeddings`, and the two
/// embeddings in the canned response must be returned in order.
#[test]
fn openai_compatible_backend_embeds_with_mock_server() {
    let respond = |request_line: String, path: String, _body: String| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/v1/embeddings");
        "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
    };
    let (base_url, handle) = start_mock_http_server(respond);

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "test-embedding".to_string(),
        base_url: Some(base_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();

    let texts = vec!["hello".to_string(), "world".to_string()];
    let vectors = model.embed(texts).unwrap();

    let expected = vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]];
    assert_eq!(vectors, expected);
    // Joining surfaces any assertion failure raised inside the handler.
    handle.join().unwrap();
}
3739
/// Regression for issue #36: AFT was sending TWO Content-Type headers
/// on the OpenAI embeddings request — once implicitly via `.json(&body)`
/// and again explicitly via `.header("Content-Type", "application/json")`.
/// reqwest's `.header()` calls `HeaderMap::append`, which produces two
/// headers on the wire. OpenAI's /v1/embeddings endpoint rejects that
/// with `HTTP 400 "you must provide a model parameter"` even though the
/// body actually contains `model`. The fix is to drop the explicit
/// `.header("Content-Type", ...)` call. This test pins that we send
/// exactly one Content-Type header.
///
/// The test runs its own one-shot TCP server so it can capture the raw
/// request bytes (headers and body) exactly as they arrived on the wire.
#[test]
fn openai_compatible_request_has_single_content_type_header() {
    use std::sync::{Arc, Mutex};
    let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
    let captured_for_thread = Arc::clone(&captured);

    let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
    let addr = listener.local_addr().expect("local addr");
    let handle = thread::spawn(move || {
        let (mut stream, _) = listener.accept().expect("accept");
        let mut buf = Vec::new();
        let mut chunk = [0u8; 4096];
        let mut header_end = None;
        let mut content_length = 0usize;
        loop {
            let n = stream.read(&mut chunk).expect("read");
            if n == 0 {
                break;
            }
            buf.extend_from_slice(&chunk[..n]);
            if header_end.is_none() {
                if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
                    header_end = Some(pos + 4);
                    for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
                        // Header names are case-insensitive and the client
                        // may send `content-length` in lowercase. A
                        // case-sensitive match would leave the length at 0
                        // and stop reading before the body has arrived,
                        // making the body assertion below racy.
                        if let Some((name, value)) = line.split_once(':') {
                            if name.trim().eq_ignore_ascii_case("content-length") {
                                content_length = value.trim().parse::<usize>().unwrap_or(0);
                            }
                        }
                    }
                }
            }
            if let Some(end) = header_end {
                // Keep reading until the whole announced body is captured.
                if buf.len() >= end + content_length {
                    break;
                }
            }
        }
        *captured_for_thread.lock().unwrap() = buf;
        let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
        let response = format!(
            "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
            body.len(),
            body
        );
        let _ = stream.write_all(response.as_bytes());
    });

    let config = SemanticBackendConfig {
        backend: SemanticBackend::OpenAiCompatible,
        model: "text-embedding-3-small".to_string(),
        base_url: Some(format!("http://{}", addr)),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
    let _ = model.embed(vec!["probe".to_string()]).unwrap();
    handle.join().unwrap();

    let bytes = captured.lock().unwrap().clone();
    let request = String::from_utf8_lossy(&bytes);

    // Compare lowercased lines because HTTP header names are
    // case-insensitive and the client may emit `content-type` in lowercase.
    let content_type_lines = request
        .lines()
        .filter(|line| {
            let lower = line.to_ascii_lowercase();
            lower.starts_with("content-type:")
        })
        .count();
    assert_eq!(
        content_type_lines, 1,
        "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
    );

    // The body must still include the model field — pin this so a future
    // change can't accidentally drop `model` while fixing duplicate headers.
    assert!(
        request.contains(r#""model":"text-embedding-3-small""#),
        "request body should contain model field; full request:\n{request}",
    );
}
3832
/// End-to-end check of the Ollama backend against the mock server: the
/// request must be a `POST` to `/api/embed`, and the two embeddings in the
/// canned response must be returned in order.
#[test]
fn ollama_backend_embeds_with_mock_server() {
    let respond = |request_line: String, path: String, _body: String| {
        assert!(request_line.starts_with("POST "));
        assert_eq!(path, "/api/embed");
        "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
    };
    let (base_url, handle) = start_mock_http_server(respond);

    let config = SemanticBackendConfig {
        backend: SemanticBackend::Ollama,
        model: "embeddinggemma".to_string(),
        base_url: Some(base_url),
        api_key_env: None,
        timeout_ms: 5_000,
        max_batch_size: 64,
        max_files: 20_000,
    };
    let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();

    let texts = vec!["hello".to_string(), "world".to_string()];
    let vectors = model.embed(texts).unwrap();

    let expected = vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]];
    assert_eq!(vectors, expected);
    // Joining surfaces any assertion failure raised inside the handler.
    handle.join().unwrap();
}
3859
3860    #[test]
3861    fn read_from_disk_rejects_fingerprint_mismatch() {
3862        let storage = tempfile::tempdir().unwrap();
3863        let project_key = "proj";
3864
3865        let project_root = test_project_root();
3866        let file = project_root.join("src/main.rs");
3867        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
3868        index.entries.push(EmbeddingEntry {
3869            chunk: SemanticChunk {
3870                file: file.clone(),
3871                name: "handle_request".to_string(),
3872                kind: SymbolKind::Function,
3873                start_line: 10,
3874                end_line: 25,
3875                exported: true,
3876                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3877                snippet: "fn handle_request() {}".to_string(),
3878            },
3879            vector: vec![0.1, 0.2, 0.3],
3880        });
3881        index.dimension = 3;
3882        index
3883            .file_mtimes
3884            .insert(file.clone(), SystemTime::UNIX_EPOCH);
3885        index.file_sizes.insert(file, 0);
3886        index.set_fingerprint(SemanticIndexFingerprint {
3887            backend: "openai_compatible".to_string(),
3888            model: "test-embedding".to_string(),
3889            base_url: "http://127.0.0.1:1234/v1".to_string(),
3890            dimension: 3,
3891            chunking_version: default_chunking_version(),
3892        });
3893        index.write_to_disk(storage.path(), project_key);
3894
3895        let matching = index.fingerprint().unwrap().as_string();
3896        assert!(SemanticIndex::read_from_disk(
3897            storage.path(),
3898            project_key,
3899            &project_root,
3900            false,
3901            Some(&matching),
3902        )
3903        .is_some());
3904
3905        let mismatched = SemanticIndexFingerprint {
3906            backend: "ollama".to_string(),
3907            model: "embeddinggemma".to_string(),
3908            base_url: "http://127.0.0.1:11434".to_string(),
3909            dimension: 3,
3910            chunking_version: default_chunking_version(),
3911        }
3912        .as_string();
3913        assert!(SemanticIndex::read_from_disk(
3914            storage.path(),
3915            project_key,
3916            &project_root,
3917            false,
3918            Some(&mismatched),
3919        )
3920        .is_none());
3921    }
3922
3923    #[test]
3924    fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
3925        let storage = tempfile::tempdir().unwrap();
3926        let project_key = "proj-v3";
3927        let dir = storage.path().join("semantic").join(project_key);
3928        fs::create_dir_all(&dir).unwrap();
3929
3930        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3931        index.entries.push(EmbeddingEntry {
3932            chunk: SemanticChunk {
3933                file: PathBuf::from("/src/main.rs"),
3934                name: "handle_request".to_string(),
3935                kind: SymbolKind::Function,
3936                start_line: 0,
3937                end_line: 0,
3938                exported: true,
3939                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3940                snippet: "fn handle_request() {}".to_string(),
3941            },
3942            vector: vec![0.1, 0.2, 0.3],
3943        });
3944        index.dimension = 3;
3945        index
3946            .file_mtimes
3947            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
3948        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
3949        let fingerprint = SemanticIndexFingerprint {
3950            backend: "fastembed".to_string(),
3951            model: "test".to_string(),
3952            base_url: FALLBACK_BACKEND.to_string(),
3953            dimension: 3,
3954            chunking_version: default_chunking_version(),
3955        };
3956        index.set_fingerprint(fingerprint.clone());
3957
3958        let mut bytes = index.to_bytes();
3959        bytes[0] = SEMANTIC_INDEX_VERSION_V3;
3960        fs::write(dir.join("semantic.bin"), bytes).unwrap();
3961
3962        assert!(SemanticIndex::read_from_disk(
3963            storage.path(),
3964            project_key,
3965            &test_project_root(),
3966            false,
3967            Some(&fingerprint.as_string())
3968        )
3969        .is_none());
3970        assert!(!dir.join("semantic.bin").exists());
3971    }
3972
3973    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3974        crate::symbols::Symbol {
3975            name: name.to_string(),
3976            kind,
3977            range: crate::symbols::Range {
3978                start_line: start,
3979                start_col: 0,
3980                end_line: end,
3981                end_col: 0,
3982            },
3983            signature: None,
3984            scope_chain: Vec::new(),
3985            exported: false,
3986            parent: None,
3987        }
3988    }
3989
3990    /// Heading symbols (Markdown / HTML headings) must NOT be indexed —
3991    /// they overwhelmingly dominated semantic results even on code-shaped
3992    /// queries because heading prose embeds far more strongly than code
3993    /// chunks. Skipping headings keeps aft_search a code-finder.
3994    #[test]
3995    fn symbols_to_chunks_skips_heading_symbols() {
3996        let project_root = PathBuf::from("/proj");
3997        let file = project_root.join("README.md");
3998        let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
3999
4000        let symbols = vec![
4001            make_symbol(SymbolKind::Heading, "Title", 0, 2),
4002            make_symbol(SymbolKind::Heading, "Section", 4, 6),
4003        ];
4004
4005        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
4006        assert!(
4007            chunks.is_empty(),
4008            "Heading symbols must be filtered out before embedding; got {} chunk(s)",
4009            chunks.len()
4010        );
4011    }
4012
4013    /// A symbol with an enormous signature (e.g. a YAML/Kubernetes CronJob
4014    /// whose inline `command:` script is parsed into the signature) must not
4015    /// produce an embed_text that overflows the embedding backend's physical
4016    /// batch. Before the clamp, the unbounded `signature:` append created a
4017    /// multi-KB input that aborted the whole index build and degraded every
4018    /// search to lexical-only.
4019    #[test]
4020    fn build_embed_text_clamps_oversized_signature() {
4021        let project_root = PathBuf::from("/proj");
4022        let file = project_root.join("cronjob.yaml");
4023        let huge_sig = "kubectl ".repeat(2000); // ~16 KB
4024        let source = "apiVersion: batch/v1\nkind: CronJob\n";
4025
4026        let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
4027        symbol.signature = Some(huge_sig);
4028
4029        let text = build_embed_text(&symbol, source, &file, &project_root);
4030        assert!(
4031            text.chars().count() <= MAX_EMBED_TEXT_CHARS,
4032            "embed_text must be clamped to {} chars, got {}",
4033            MAX_EMBED_TEXT_CHARS,
4034            text.chars().count()
4035        );
4036    }
4037
4038    /// Code symbols (functions, classes, methods, structs, etc.) must still
4039    /// be indexed alongside the heading skip — otherwise we'd starve the
4040    /// index entirely.
4041    #[test]
4042    fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
4043        let project_root = PathBuf::from("/proj");
4044        let file = project_root.join("src/lib.rs");
4045        let source = "pub fn handle_request() -> bool {\n    true\n}\n";
4046
4047        let symbols = vec![
4048            // A heading mixed in (e.g. from a doc comment block elsewhere).
4049            make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
4050            make_symbol(SymbolKind::Function, "handle_request", 0, 2),
4051            make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
4052        ];
4053
4054        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
4055        assert_eq!(
4056            chunks.len(),
4057            3,
4058            "Expected file-summary + 2 code chunks (Function + Struct), got {}",
4059            chunks.len()
4060        );
4061        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
4062        assert!(chunks
4063            .iter()
4064            .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
4065        assert!(names.contains(&"handle_request"));
4066        assert!(names.contains(&"AuthService"));
4067        assert!(
4068            !names.contains(&"doc heading"),
4069            "Heading symbol leaked into chunks: {names:?}"
4070        );
4071    }
4072
4073    #[test]
4074    fn validate_ssrf_allows_loopback_hostnames() {
4075        // Loopback hostnames are explicitly allowed so self-hosted backends
4076        // (Ollama at http://localhost:11434) work at their default config.
4077        for host in &[
4078            "http://localhost",
4079            "http://localhost:8080",
4080            "http://localhost:11434", // Ollama default
4081            "http://localhost.localdomain",
4082            "http://foo.localhost",
4083        ] {
4084            assert!(
4085                validate_base_url_no_ssrf(host).is_ok(),
4086                "Expected {host} to be allowed (loopback), got: {:?}",
4087                validate_base_url_no_ssrf(host)
4088            );
4089        }
4090    }
4091
4092    #[test]
4093    fn validate_ssrf_allows_loopback_ips() {
4094        // 127.0.0.0/8 is loopback — by definition same-machine and not an
4095        // SSRF target. Allow it so Ollama at http://127.0.0.1:11434 works.
4096        for url in &[
4097            "http://127.0.0.1",
4098            "http://127.0.0.1:11434", // Ollama default
4099            "http://127.0.0.1:8080",
4100            "http://127.1.2.3",
4101        ] {
4102            let result = validate_base_url_no_ssrf(url);
4103            assert!(
4104                result.is_ok(),
4105                "Expected {url} to be allowed (loopback), got: {:?}",
4106                result
4107            );
4108        }
4109    }
4110
4111    #[test]
4112    fn validate_ssrf_rejects_private_non_loopback_ips() {
4113        // Non-loopback private/reserved IPs remain rejected — homelab/intranet
4114        // services on LAN IPs are real SSRF targets even though the user
4115        // configured them. Users who want this can opt in by binding the
4116        // service to a public-routable address.
4117        for url in &[
4118            "http://192.168.1.1",
4119            "http://10.0.0.1",
4120            "http://172.16.0.1",
4121            "http://169.254.169.254",
4122            "http://100.64.0.1",
4123        ] {
4124            let result = validate_base_url_no_ssrf(url);
4125            assert!(
4126                result.is_err(),
4127                "Expected {url} to be rejected (non-loopback private), got: {:?}",
4128                result
4129            );
4130        }
4131    }
4132
4133    #[test]
4134    fn validate_ssrf_rejects_mdns_local_hostnames() {
4135        // mDNS .local hostnames typically resolve to LAN devices, not
4136        // loopback. Rejecting them before DNS lookup gives a clearer error.
4137        for host in &[
4138            "http://printer.local",
4139            "http://nas.local:8080",
4140            "http://homelab.local",
4141        ] {
4142            let result = validate_base_url_no_ssrf(host);
4143            assert!(
4144                result.is_err(),
4145                "Expected {host} to be rejected (mDNS), got: {:?}",
4146                result
4147            );
4148        }
4149    }
4150
4151    #[test]
4152    fn normalize_base_url_allows_localhost_for_tests() {
4153        // normalize_base_url itself should NOT block localhost — only
4154        // validate_base_url_no_ssrf does. Tests construct backends directly.
4155        assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
4156        assert!(normalize_base_url("http://localhost:8080").is_ok());
4157    }
4158
4159    /// Pin the user-facing wording of the ONNX version-mismatch error.
4160    /// The auto-fix path MUST be listed first because it's the only safe
4161    /// option that doesn't require sudo or risk breaking other apps that
4162    /// link the system library. Regression of any of these strings would
4163    /// either mislead users (system rm before auto-fix) or break the
4164    /// `aft doctor --fix` discovery path.
4165    #[test]
4166    fn ort_mismatch_message_recommends_auto_fix_first() {
4167        let msg =
4168            format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
4169
4170        // The reported version and path must appear verbatim.
4171        assert!(
4172            msg.contains("v1.9.0"),
4173            "should report detected version: {msg}"
4174        );
4175        assert!(
4176            msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
4177            "should report system path: {msg}"
4178        );
4179        assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
4180
4181        // Solution ordering: auto-fix is #1, system rm is #2, install is #3.
4182        let auto_fix_pos = msg
4183            .find("Auto-fix")
4184            .expect("Auto-fix solution missing — users won't discover --fix");
4185        let remove_pos = msg
4186            .find("Remove the old library")
4187            .expect("system-rm solution missing");
4188        assert!(
4189            auto_fix_pos < remove_pos,
4190            "Auto-fix must come before manual rm — see PR comment thread"
4191        );
4192
4193        // The auto-fix command must be runnable as-is on a fresh system.
4194        assert!(
4195            msg.contains("npx @cortexkit/aft doctor --fix"),
4196            "auto-fix command must be present and copy-pasteable: {msg}"
4197        );
4198    }
4199
4200    /// macOS dylib paths must not produce a malformed message when the
4201    /// system path lacks a trailing slash. This is a regression guard
4202    /// for the "{}\n{}" format string contract.
4203    #[test]
4204    fn ort_mismatch_message_handles_macos_dylib_path() {
4205        let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
4206        assert!(msg.contains("v1.9.0"));
4207        assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
4208        // The dylib path must appear in the auto-fix paragraph (single
4209        // quotes around it) AND in the manual-rm paragraph; verify
4210        // both placements survived the format string.
4211        assert!(
4212            msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
4213            "system path should be quoted in the auto-fix sentence: {msg}"
4214        );
4215    }
4216}
aft/semantic_index.rs

aft/
semantic_index.rs