Skip to main content

aft/
semantic_index.rs

1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::io::{self, BufReader, BufWriter, Cursor, Read, Write};
18use std::path::{Path, PathBuf};
19use std::sync::Mutex;
20use std::time::Duration;
21use std::time::SystemTime;
22use tree_sitter::Parser;
23use url::Url;
24
// NOTE(review): 384 presumably matches the bundled local model's output width;
// the use site is outside this chunk — confirm.
const DEFAULT_DIMENSION: usize = 384;
// NOTE(review): upper bound on index entries; enforcement happens outside this chunk.
const MAX_ENTRIES: usize = 1_000_000;
// Covers high-dimensional backends such as OpenAI text-embedding-3-large (3072)
// and common local models (4096) while keeping a bounded supported shape.
// Enforced by `validate_embedding_dimension`.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of a single `f32`.
const F32_BYTES: usize = std::mem::size_of::<f32>();
// NOTE(review): the header layouts these byte counts describe are defined
// outside this chunk — confirm against the index reader/writer.
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// Installation guidance text for a missing ONNX Runtime.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
/// V3 adds subsec_nanos to the file-mtime table so staleness detection survives
/// restart round-trips on filesystems with subsecond mtime precision (APFS,
/// ext4 with nsec, NTFS). V1/V2 persisted whole-second mtimes only, which
/// caused every restart to flag ~99% of files as stale and re-embed them.
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
/// V4 keeps the V3 on-disk layout but rebuilds persisted snippets once after
/// fixing symbol ranges that were incorrectly treated as 1-based.
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
/// V5 adds file sizes to the file metadata table so incremental staleness
/// detection can catch content changes even when mtime precision misses them.
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
/// V6 stores paths relative to project_root and adds content hashes.
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended by `build_openai_embeddings_endpoint` (after a `/v1` segment).
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended by `build_ollama_embeddings_endpoint` when the base URL does
/// not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
// Build/refresh embedding requests keep a larger budget because they run on
// background workers and often batch many texts through a cold local backend.
// Despite the name, `configured_embedding_timeout_ms` applies this default to
// every backend whenever the configured timeout is 0.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
// Interactive query embedding runs inside semantic_search dispatch; keep it
// short so slow/unreachable remote backends degrade to lexical quickly.
const DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS: u64 = 8_000;
/// Batch size used when the configured `max_batch_size` is 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Entry count at which `embed_query_cached` starts evicting its oldest entry.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint `base_url` when the config has no
/// base URL or it fails to normalize.
const FALLBACK_BACKEND: &str = "none";
/// Total number of tries `send_embedding_request` makes per request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay in milliseconds before the next try, indexed by the zero-based
/// attempt that just failed.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Held by `SemanticIndexLock::acquire` while it attempts the file lock, so
/// acquisition attempts inside this process run one at a time.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
67
/// Handle for the per-project semantic cache lock obtained through
/// [`SemanticIndexLock::acquire`].
pub struct SemanticIndexLock {
    // Never read; stored so the guard lives exactly as long as this value.
    // NOTE(review): presumably dropping the guard releases the lock — confirm in `fs_lock`.
    _guard: fs_lock::LockGuard,
}
71
72impl SemanticIndexLock {
73    pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
74        let dir = storage_dir.join("semantic").join(project_key);
75        fs::create_dir_all(&dir)?;
76        let path = dir.join("cache.lock");
77        let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
78            .lock()
79            .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
80        fs_lock::try_acquire(&path, Duration::from_secs(2))
81            .map(|guard| Self { _guard: guard })
82            .map_err(|error| match error {
83                fs_lock::AcquireError::Timeout => {
84                    std::io::Error::other("timed out acquiring semantic cache lock")
85                }
86                fs_lock::AcquireError::Io(error) => error,
87            })
88    }
89}
90
/// Identity of the embedding setup an index was built with. Its JSON encoding
/// (see `as_string`) is what gets compared against an expected fingerprint.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier string taken from the config.
    pub backend: String,
    /// Embedding model name taken from the config.
    pub model: String,
    /// Normalized base URL, or the `FALLBACK_BACKEND` placeholder when built
    /// via `from_config` without a usable URL. Deserializes to an empty string
    /// when the field is absent.
    #[serde(default)]
    pub base_url: String,
    /// Length of the embedding vectors.
    pub dimension: usize,
    /// Chunking scheme version; falls back to `default_chunking_version()`
    /// when the field is absent from serialized data.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
101
/// Chunking version stamped on new fingerprints and assumed for serialized
/// fingerprints that omit the field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
105
106impl SemanticIndexFingerprint {
107    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
108        // Use normalized URL for fingerprinting so cosmetic differences
109        // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds.
110        let base_url = config
111            .base_url
112            .as_ref()
113            .and_then(|u| normalize_base_url(u).ok())
114            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
115        Self {
116            backend: config.backend.as_str().to_string(),
117            model: config.model.clone(),
118            base_url,
119            dimension,
120            chunking_version: default_chunking_version(),
121        }
122    }
123
124    pub fn as_string(&self) -> String {
125        serde_json::to_string(self).unwrap_or_else(|_| String::new())
126    }
127
128    fn matches_expected(&self, expected: &str) -> bool {
129        let encoded = self.as_string();
130        !encoded.is_empty() && encoded == expected
131    }
132}
133
/// Backend-specific state needed to turn texts into vectors.
enum SemanticEmbeddingEngine {
    /// Local ONNX embedder (all-MiniLM-L6-v2 via raw `ort`). The config-facing
    /// backend string stays "fastembed" for index-fingerprint compatibility.
    Local(LocalEmbedder),
    /// Remote backend reached through an OpenAI-style embeddings endpoint.
    OpenAiCompatible {
        /// Blocking HTTP client built with the model's timeout.
        client: Client,
        /// Sent as the `model` field of the request body.
        model: String,
        /// Base URL after `normalize_base_url`.
        base_url: String,
        /// When present, sent as an `Authorization: Bearer …` header.
        api_key: Option<String>,
    },
    /// Ollama backend reached through its embed endpoint.
    Ollama {
        /// Blocking HTTP client built with the model's timeout.
        client: Client,
        /// Sent as the `model` field of the request payload.
        model: String,
        /// Base URL after `normalize_base_url`.
        base_url: String,
    },
}
150
/// An embedding backend plus the settings it was configured with and a small
/// cache of query embeddings.
pub struct SemanticEmbeddingModel {
    /// Backend kind copied from the config.
    backend: SemanticBackend,
    /// Model name copied from the config.
    model: String,
    /// Base URL exactly as configured (not normalized); the normalized form
    /// lives inside `engine`.
    base_url: Option<String>,
    /// Timeout, in milliseconds, applied to the HTTP client.
    timeout_ms: u64,
    /// Configured batch size, or `DEFAULT_MAX_BATCH_SIZE` when the config says 0.
    max_batch_size: usize,
    /// Embedding vector length once known; `None` until first determined.
    dimension: Option<usize>,
    /// Backend-specific embedding state.
    engine: SemanticEmbeddingEngine,
    /// Query text → embedding vector.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Cached query texts in insertion order; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    /// Number of `embed_query_cached` calls answered from the cache.
    query_embedding_cache_hits: u64,
    /// Number of `embed_query_cached` calls that had to consult the backend.
    query_embedding_cache_misses: u64,
}

/// Shorter alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
166
167fn validate_embedding_batch(
168    vectors: &[Vec<f32>],
169    expected_count: usize,
170    context: &str,
171) -> Result<(), String> {
172    if expected_count > 0 && vectors.is_empty() {
173        return Err(format!(
174            "{context} returned no vectors for {expected_count} inputs"
175        ));
176    }
177
178    if vectors.len() != expected_count {
179        return Err(format!(
180            "{context} returned {} vectors for {} inputs",
181            vectors.len(),
182            expected_count
183        ));
184    }
185
186    let Some(first_vector) = vectors.first() else {
187        return Ok(());
188    };
189    let expected_dimension = first_vector.len();
190    validate_embedding_dimension(expected_dimension)
191        .map_err(|error| format!("{context} returned {error}"))?;
192    for (index, vector) in vectors.iter().enumerate() {
193        if vector.len() != expected_dimension {
194            return Err(format!(
195                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
196                vector.len()
197            ));
198        }
199    }
200
201    Ok(())
202}
203
204fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
205    if dimension == 0 || dimension > MAX_DIMENSION {
206        return Err(format!(
207            "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
208        ));
209    }
210
211    Ok(())
212}
213
214/// Normalize a base URL: validate scheme and strip trailing slash.
215/// Does NOT perform SSRF/private-IP validation — call
216/// `validate_base_url_no_ssrf` separately when processing user-supplied config.
217fn normalize_base_url(raw: &str) -> Result<String, String> {
218    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
219    let scheme = parsed.scheme();
220    if scheme != "http" && scheme != "https" {
221        return Err(format!(
222            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
223            scheme
224        ));
225    }
226    Ok(parsed.to_string().trim_end_matches('/').to_string())
227}
228
229/// Validate that a base URL does not point to a private/loopback address.
230/// Call this on user-supplied config (at configure time) to prevent SSRF.
231/// Not called for programmatically constructed configs (e.g. tests).
232///
233/// **Loopback is allowed.** Self-hosted embedding backends (e.g. Ollama at
234/// `http://127.0.0.1:11434`) are a primary use case for `aft_search`. Loopback
235/// addresses by definition cannot be exploited as SSRF targets — they only
236/// reach services on the same machine. Allowing loopback unblocks Ollama at its
237/// default config without opening up SSRF to LAN/intranet services, which
238/// remain rejected.
239///
240/// **mDNS `.local` is rejected.** mDNS hostnames typically resolve to LAN
241/// devices (printers, homelab servers); rejecting them before DNS lookup keeps
242/// the SSRF guard meaningful for non-loopback private networks.
243pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
244    use std::net::{IpAddr, ToSocketAddrs};
245
246    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
247
248    let host = parsed.host_str().unwrap_or("");
249
250    // Loopback hostnames are explicitly allowed. RFC 6761 mandates that
251    // `localhost` and `*.localhost` resolve to loopback;
252    // `localhost.localdomain` is a historical alias used on some Linux
253    // distros. Self-hosted backends like Ollama use these by default.
254    let is_loopback_host =
255        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
256    if is_loopback_host {
257        return Ok(());
258    }
259
260    // mDNS hostnames are typically LAN devices, not loopback. Reject before
261    // DNS lookup so users get a clear error rather than a private-IP error.
262    if host.ends_with(".local") {
263        return Err(format!(
264            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
265        ));
266    }
267
268    // Resolve the hostname. Reject private/link-local/CGNAT IPs but NOT
269    // loopback (which is by definition same-machine and not an SSRF target).
270    let port = parsed.port_or_known_default().unwrap_or(443);
271    let addr_str = format!("{host}:{port}");
272    let addrs: Vec<IpAddr> = addr_str
273        .to_socket_addrs()
274        .map(|iter| iter.map(|sa| sa.ip()).collect())
275        .unwrap_or_default();
276    for ip in &addrs {
277        if is_private_non_loopback_ip(ip) {
278            return Err(format!(
279                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
280            ));
281        }
282    }
283
284    Ok(())
285}
286
287/// Returns true for IPv4/IPv6 addresses in private/link-local/CGNAT/benchmark/
288/// multicast/reserved ranges, EXCLUDING loopback (127.0.0.0/8 and ::1). Loopback
289/// is considered safe for SSRF purposes (same-machine, e.g. a local Ollama
290/// endpoint) — see [`validate_base_url_no_ssrf`] for rationale.
291///
292/// Delegates to [`crate::url_fetch::is_private_or_reserved_ip`] so there is one
293/// authoritative reserved-range list (the url_fetch copy is the maintained one;
294/// this used to be a drifting subset that missed e.g. 198.18.0.0/15 and the
295/// multicast/reserved blocks). We only re-add the loopback carve-out the
296/// url_fetch guard deliberately does not make.
297fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
298    // Canonicalize so an IPv4-mapped loopback (`::ffff:127.0.0.1`) is also
299    // recognized as loopback, matching the prior carve-out.
300    if ip.to_canonical().is_loopback() {
301        return false;
302    }
303    crate::url_fetch::is_private_or_reserved_ip(*ip)
304}
305
306fn build_openai_embeddings_endpoint(base_url: &str) -> String {
307    if base_url.ends_with("/v1") {
308        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
309    } else {
310        format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
311    }
312}
313
314fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
315    if base_url.ends_with("/api") {
316        format!("{base_url}/embed")
317    } else {
318        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
319    }
320}
321
/// Trims surrounding whitespace from an optional string and maps a missing,
/// empty or whitespace-only value to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    match value.as_deref().map(str::trim) {
        Some(trimmed) if !trimmed.is_empty() => Some(trimmed.to_owned()),
        _ => None,
    }
}
332
333fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
334    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
335}
336
337/// Local backends (LM Studio, Ollama, llama.cpp) can return a 4xx — usually
338/// 400/409 — while a model is loading or was just unloaded. Only narrowly known
339/// local-backend loading/unloaded payloads are classified transient; generic
340/// 4xx bodies that merely mention phrases like "loading model" remain
341/// permanent so misconfigurations do not retry forever.
342fn embedding_response_body_is_transient(status: reqwest::StatusCode, raw: &str) -> bool {
343    if !matches!(
344        status,
345        reqwest::StatusCode::BAD_REQUEST
346            | reqwest::StatusCode::CONFLICT
347            | reqwest::StatusCode::REQUEST_TIMEOUT
348            | reqwest::StatusCode::LOCKED
349            | reqwest::StatusCode::TOO_EARLY
350    ) {
351        return false;
352    }
353
354    let lower = raw.to_ascii_lowercase();
355    let normalized = lower.trim();
356
357    normalized.contains("model was unloaded while the request was still in queue")
358        || normalized == "model is loading"
359        || normalized.starts_with("model is loading,")
360        || normalized.contains(r#""error":"model is loading"#)
361        || normalized.contains(r#""message":"model is loading"#)
362        || normalized == "model not loaded"
363        || normalized.contains(r#""error":"model not loaded""#)
364        || normalized.contains(r#""message":"model not loaded""#)
365        || normalized == "loading model into memory"
366        || normalized.contains(r#""error":"loading model into memory""#)
367        || normalized.contains(r#""message":"loading model into memory""#)
368        || normalized == "model is being loaded"
369        || normalized.contains(r#""error":"model is being loaded""#)
370        || normalized.contains(r#""message":"model is being loaded""#)
371        || normalized == "model is currently loading"
372        || normalized.contains(r#""error":"model is currently loading""#)
373        || normalized.contains(r#""message":"model is currently loading""#)
374}
375
376fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
377    error.is_connect()
378}
379
380/// Whether a send-time error means the backend is *unreachable or temporarily
381/// failing* (vs. a real misconfiguration). Broader than the in-request retry
382/// predicate: a per-request timeout is transient for the build/refresh layer
383/// (the model may still be cold-loading) but we don't burn the 3 fast
384/// in-request attempts on it — the build-level retry rides it out instead.
385fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
386    error.is_connect() || error.is_timeout()
387}
388
389fn embedding_response_read_error_is_transient(error: &reqwest::Error) -> bool {
390    embedding_send_error_is_transient(error) || error.is_body() || error.is_decode()
391}
392
/// Stable machine marker prefixed onto embedding error strings whose root cause
/// is transient — the backend is down, timing out, or returning 5xx/429, not
/// misconfigured. The build and corpus-refresh layers key retry-vs-give-up on
/// this marker (see [`embedding_failure_is_transient`]) instead of re-parsing
/// error text, so transience stays authoritative at the one site that knows it.
/// Stripped before any user-facing display via [`strip_transient_embedding_marker`].
///
/// The trailing space is part of the marker, so the remaining message reads
/// cleanly once the marker is removed.
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";
400
401/// True when an embedding error carries the transient marker — i.e. retrying
402/// once the backend recovers is the right move, not surfacing a hard failure.
403pub fn embedding_failure_is_transient(error: &str) -> bool {
404    error.contains(TRANSIENT_EMBEDDING_MARKER)
405}
406
407/// Remove the machine transient marker so the message is clean for display.
408pub fn strip_transient_embedding_marker(error: &str) -> String {
409    error.replace(TRANSIENT_EMBEDDING_MARKER, "")
410}
411
412fn sleep_before_embedding_retry(attempt_index: usize) {
413    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
414        std::thread::sleep(Duration::from_millis(*delay_ms));
415    }
416}
417
418fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
419where
420    F: FnMut() -> reqwest::blocking::RequestBuilder,
421{
422    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
423        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
424
425        let response = match make_request().send() {
426            Ok(response) => response,
427            Err(error) => {
428                if !last_attempt && is_retryable_embedding_error(&error) {
429                    sleep_before_embedding_retry(attempt_index);
430                    continue;
431                }
432                // Connect/timeout failures mean the backend is unreachable or
433                // cold-loading — mark transient so the build layer rides it out
434                // and self-heals instead of parking the index in `Failed`.
435                let marker = if embedding_send_error_is_transient(&error) {
436                    TRANSIENT_EMBEDDING_MARKER
437                } else {
438                    ""
439                };
440                return Err(format!("{marker}{backend_label} request failed: {error}"));
441            }
442        };
443
444        let status = response.status();
445        let raw = match response.text() {
446            Ok(raw) => raw,
447            Err(error) => {
448                if !last_attempt && embedding_response_read_error_is_transient(&error) {
449                    sleep_before_embedding_retry(attempt_index);
450                    continue;
451                }
452                let marker = if embedding_response_read_error_is_transient(&error) {
453                    TRANSIENT_EMBEDDING_MARKER
454                } else {
455                    ""
456                };
457                return Err(format!(
458                    "{marker}{backend_label} response read failed: {error}"
459                ));
460            }
461        };
462
463        if status.is_success() {
464            return Ok(raw);
465        }
466
467        // A 4xx whose body says the model is loading/unloaded is transient on
468        // local backends (LM Studio/Ollama), so treat it like a retryable
469        // status: ride it out at both the in-request and build-retry layers.
470        let body_transient = embedding_response_body_is_transient(status, &raw);
471        if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
472            sleep_before_embedding_retry(attempt_index);
473            continue;
474        }
475
476        // 5xx / 429 are server-side and transient — the backend is overloaded
477        // or briefly unavailable, not misconfigured. A 4xx whose body indicates
478        // the model is (un)loading is also transient (local backend mid-swap).
479        // Other 4xx (auth, bad request, model-not-found) is a real error the
480        // user must fix; no marker.
481        let marker = if is_retryable_embedding_status(status) || body_transient {
482            TRANSIENT_EMBEDDING_MARKER
483        } else {
484            ""
485        };
486        return Err(format!(
487            "{marker}{backend_label} request failed (HTTP {}): {}",
488            status, raw
489        ));
490    }
491
492    unreachable!("embedding request retries exhausted without returning")
493}
494
495fn configured_embedding_timeout_ms(config: &SemanticBackendConfig) -> u64 {
496    if config.timeout_ms == 0 {
497        DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
498    } else {
499        config.timeout_ms
500    }
501}
502
503impl SemanticEmbeddingModel {
504    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
505        Self::from_config_with_timeout_ms(config, configured_embedding_timeout_ms(config))
506    }
507
508    pub fn from_config_for_query(config: &SemanticBackendConfig) -> Result<Self, String> {
509        let timeout_ms =
510            configured_embedding_timeout_ms(config).min(DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS);
511        Self::from_config_with_timeout_ms(config, timeout_ms)
512    }
513
514    fn from_config_with_timeout_ms(
515        config: &SemanticBackendConfig,
516        timeout_ms: u64,
517    ) -> Result<Self, String> {
518        let max_batch_size = if config.max_batch_size == 0 {
519            DEFAULT_MAX_BATCH_SIZE
520        } else {
521            config.max_batch_size
522        };
523
524        let api_key_env = normalize_api_key(config.api_key_env.clone());
525        let model = config.model.clone();
526
527        let client = Client::builder()
528            .timeout(Duration::from_millis(timeout_ms))
529            .redirect(reqwest::redirect::Policy::none())
530            .build()
531            .map_err(|error| format!("failed to configure embedding client: {error}"))?;
532
533        let engine = match config.backend {
534            SemanticBackend::Fastembed => {
535                SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
536            }
537            SemanticBackend::OpenAiCompatible => {
538                let raw = config.base_url.as_ref().ok_or_else(|| {
539                    "base_url is required for openai_compatible backend".to_string()
540                })?;
541                let base_url = normalize_base_url(raw)?;
542
543                let api_key = match api_key_env {
544                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
545                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
546                    })?),
547                    None => None,
548                };
549
550                SemanticEmbeddingEngine::OpenAiCompatible {
551                    client,
552                    model,
553                    base_url,
554                    api_key,
555                }
556            }
557            SemanticBackend::Ollama => {
558                let raw = config
559                    .base_url
560                    .as_ref()
561                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
562                let base_url = normalize_base_url(raw)?;
563
564                SemanticEmbeddingEngine::Ollama {
565                    client,
566                    model,
567                    base_url,
568                }
569            }
570        };
571
572        Ok(Self {
573            backend: config.backend,
574            model: config.model.clone(),
575            base_url: config.base_url.clone(),
576            timeout_ms,
577            max_batch_size,
578            dimension: None,
579            engine,
580            query_embedding_cache: HashMap::new(),
581            query_embedding_cache_order: VecDeque::new(),
582            query_embedding_cache_hits: 0,
583            query_embedding_cache_misses: 0,
584        })
585    }
586
/// Backend kind this model was configured with.
pub fn backend(&self) -> SemanticBackend {
    self.backend
}
590
591    pub fn model(&self) -> &str {
592        &self.model
593    }
594
595    pub fn base_url(&self) -> Option<&str> {
596        self.base_url.as_deref()
597    }
598
/// Effective batch size (the configured value, or the default when it was 0).
pub fn max_batch_size(&self) -> usize {
    self.max_batch_size
}
602
/// Timeout, in milliseconds, this model's HTTP client was built with.
pub fn timeout_ms(&self) -> u64 {
    self.timeout_ms
}
606
607    pub fn fingerprint(
608        &mut self,
609        config: &SemanticBackendConfig,
610    ) -> Result<SemanticIndexFingerprint, String> {
611        let dimension = self.dimension()?;
612        Ok(SemanticIndexFingerprint::from_config(config, dimension))
613    }
614
615    pub fn dimension(&mut self) -> Result<usize, String> {
616        if let Some(dimension) = self.dimension {
617            return Ok(dimension);
618        }
619
620        let dimension = match &mut self.engine {
621            SemanticEmbeddingEngine::Local(model) => {
622                let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
623                vectors
624                    .first()
625                    .map(|v| v.len())
626                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
627            }
628            SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
629                let vectors =
630                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
631                vectors
632                    .first()
633                    .map(|v| v.len())
634                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
635            }
636            SemanticEmbeddingEngine::Ollama { .. } => {
637                let vectors =
638                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
639                vectors
640                    .first()
641                    .map(|v| v.len())
642                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
643            }
644        };
645
646        self.dimension = Some(dimension);
647        Ok(dimension)
648    }
649
/// Embeds `texts` with the configured backend. Public entry point that
/// forwards straight to `embed_texts`.
pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
    self.embed_texts(texts)
}
653
654    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
655        if let Some(vector) = self.query_embedding_cache.get(query) {
656            self.query_embedding_cache_hits += 1;
657            return Ok(vector.clone());
658        }
659
660        self.query_embedding_cache_misses += 1;
661        let embeddings = self.embed_texts(vec![query.to_string()])?;
662        let vector = embeddings
663            .first()
664            .cloned()
665            .ok_or_else(|| "embedding model returned no query vector".to_string())?;
666
667        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
668            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
669                self.query_embedding_cache.remove(&oldest);
670            }
671        }
672        self.query_embedding_cache
673            .insert(query.to_string(), vector.clone());
674        self.query_embedding_cache_order
675            .push_back(query.to_string());
676
677        Ok(vector)
678    }
679
680    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
681        (
682            self.query_embedding_cache_hits,
683            self.query_embedding_cache_misses,
684            self.query_embedding_cache.len(),
685        )
686    }
687
688    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
689        match &mut self.engine {
690            SemanticEmbeddingEngine::Local(model) => model
691                .embed(&texts)
692                .map_err(|error| format!("failed to embed batch: {error}")),
693            SemanticEmbeddingEngine::OpenAiCompatible {
694                client,
695                model,
696                base_url,
697                api_key,
698            } => {
699                let expected_text_count = texts.len();
700                let endpoint = build_openai_embeddings_endpoint(base_url);
701                let body = serde_json::json!({
702                    "input": texts,
703                    "model": model,
704                });
705
706                let raw = send_embedding_request(
707                    || {
708                        // `.json(&body)` sets Content-Type: application/json
709                        // automatically. Do NOT add `.header("Content-Type",
710                        // "application/json")` afterwards — RequestBuilder::header()
711                        // calls HeaderMap::append, which produces TWO Content-Type
712                        // headers on the wire. OpenAI's /v1/embeddings endpoint
713                        // treats duplicate Content-Type as malformed and rejects
714                        // the body with 400 "you must provide a model parameter"
715                        // even when `model` is set. Verified end-to-end against
716                        // api.openai.com. See issue #36.
717                        let mut request = client.post(&endpoint).json(&body);
718
719                        if let Some(api_key) = api_key {
720                            request = request.header("Authorization", format!("Bearer {api_key}"));
721                        }
722
723                        request
724                    },
725                    "openai compatible",
726                )?;
727
728                #[derive(Deserialize)]
729                struct OpenAiResponse {
730                    data: Vec<OpenAiEmbeddingResult>,
731                }
732
733                #[derive(Deserialize)]
734                struct OpenAiEmbeddingResult {
735                    embedding: Vec<f32>,
736                    index: Option<u32>,
737                }
738
739                let parsed: OpenAiResponse = serde_json::from_str(&raw)
740                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
741                if parsed.data.len() != expected_text_count {
742                    return Err(format!(
743                        "openai compatible response returned {} embeddings for {} inputs",
744                        parsed.data.len(),
745                        expected_text_count
746                    ));
747                }
748
749                let mut vectors = vec![Vec::new(); parsed.data.len()];
750                for (i, item) in parsed.data.into_iter().enumerate() {
751                    let index = item.index.unwrap_or(i as u32) as usize;
752                    if index >= vectors.len() {
753                        return Err(
754                            "openai compatible response contains invalid vector index".to_string()
755                        );
756                    }
757                    vectors[index] = item.embedding;
758                }
759
760                for vector in &vectors {
761                    if vector.is_empty() {
762                        return Err(
763                            "openai compatible response contained missing vectors".to_string()
764                        );
765                    }
766                }
767
768                self.dimension = vectors.first().map(Vec::len);
769                Ok(vectors)
770            }
771            SemanticEmbeddingEngine::Ollama {
772                client,
773                model,
774                base_url,
775            } => {
776                let expected_text_count = texts.len();
777                let endpoint = build_ollama_embeddings_endpoint(base_url);
778
779                #[derive(Serialize)]
780                struct OllamaPayload<'a> {
781                    model: &'a str,
782                    input: Vec<String>,
783                }
784
785                let payload = OllamaPayload {
786                    model,
787                    input: texts,
788                };
789
790                let raw = send_embedding_request(
791                    || {
792                        // `.json(&payload)` sets Content-Type automatically.
793                        // Same duplicate-header trap as the OpenAI branch above
794                        // — most Ollama servers tolerate it, but the
795                        // single-Content-Type form is the correct one.
796                        client.post(&endpoint).json(&payload)
797                    },
798                    "ollama",
799                )?;
800
801                #[derive(Deserialize)]
802                struct OllamaResponse {
803                    embeddings: Vec<Vec<f32>>,
804                }
805
806                let parsed: OllamaResponse = serde_json::from_str(&raw)
807                    .map_err(|error| format!("invalid ollama response: {error}"))?;
808                if parsed.embeddings.is_empty() {
809                    return Err("ollama response returned no embeddings".to_string());
810                }
811                if parsed.embeddings.len() != expected_text_count {
812                    return Err(format!(
813                        "ollama response returned {} embeddings for {} inputs",
814                        parsed.embeddings.len(),
815                        expected_text_count
816                    ));
817                }
818
819                let vectors = parsed.embeddings;
820                for vector in &vectors {
821                    if vector.is_empty() {
822                        return Err("ollama response contained empty embeddings".to_string());
823                    }
824                }
825
826                self.dimension = vectors.first().map(Vec::len);
827                Ok(vectors)
828            }
829        }
830    }
831}
832
/// Pre-validate ONNX Runtime by loading it ourselves before the `ort` crate does.
///
/// The library named by the `ORT_DYLIB_PATH` environment variable (or the
/// platform default name when the variable is unset) is opened with `dlopen`
/// on Linux/macOS or `LoadLibraryExW` on Windows and released again before
/// returning. This catches missing or unloadable libraries without risking a
/// panic inside the ort crate.
///
/// The runtime version is then probed:
/// - Linux/macOS: the `OrtGetApiBase` symbol is only used to resolve the path
///   of the library that was actually loaded; the version itself is read from
///   file names (see `detect_ort_version_from_loaded_library`).
/// - Windows: the version comes from the DLL's fixed file-version resource.
///
/// On any other target both platform sections are compiled out and the
/// function returns `Ok(())` without loading anything.
///
/// # Errors
///
/// Returns a user-facing message when the library cannot be loaded, or when a
/// version was detected and it is not `1.x` with `x >= 20`. When no version can
/// be detected the check is skipped and `Ok(())` is returned.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    // An explicit override wins over the platform default library name.
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY: `c_name` is a NUL-terminated C string that stays alive for
        // the `dlopen` call; `handle` is checked for null before any further
        // use and is closed exactly once below; the `dlerror` pointer is only
        // read when it is non-null.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Try to detect the runtime version from the actual loaded library
            // path first. A bare dlopen("libonnxruntime.so") may resolve to an
            // older system ORT through loader search paths; checking only the
            // caller-supplied soname would miss that and let ort fail opaquely.
            let (detected_version, version_source) =
                detect_ort_version_from_loaded_library(handle, lib_name);

            // The handle is no longer needed once the version has been probed.
            libc::dlclose(handle);

            // Check version compatibility — we need 1.20+.
            // A version string whose major/minor components do not parse as
            // integers is treated like "no version detected" and passes.
            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, &version_source));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        // Validate ONNX Runtime availability on Windows by loading the DLL
        // via LoadLibraryExW before the ort crate attempts its own LoadLibrary.
        // This way we can produce a friendly error (with installation hints)
        // instead of a raw LoadLibrary failure from deep inside fastembed.
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        // Use kernel32 LoadLibraryExW for the validation — built-in, no
        // crate dependency required. GetModuleFileNameW resolves the loaded
        // DLL path for version probing via the version.dll API.
        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Local mirror of the Win32 fixed file-version record; only
        // `dw_file_version_ms` is read below.
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32, // HIWORD major, LOWORD minor
            dw_file_version_ls: u32, // HIWORD build, LOWORD revision
            dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY: `wide` is a NUL-terminated UTF-16 buffer that outlives the
        // load call; `handle` is checked for null and freed exactly once;
        // `info` is sized from GetFileVersionInfoSizeW before it is filled;
        // `vs_info` is dereferenced only after VerQueryValueW reports success
        // and the pointer is non-null.
        // NOTE(review): `vs_len` is never compared against
        // `size_of::<VS_FIXEDFILEINFO>()` before the read — confirm that is
        // acceptable.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // Probe the file version from PE resources so we can reject
            // outdated DLLs (e.g. v1.9.x) before the ort crate panics.
            // A major of 0 doubles as the "version unknown" marker.
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            // Use MAX_UNICODEPATH (32767) so deeply nested ORT paths (e.g.
            // long NuGet package paths under %USERPROFILE%) never truncate.
            // GetModuleFileNameW truncates silently when the buffer is too
            // small, which causes version probing to fail and the version
            // check to be bypassed — better to allocate generously.
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // "\" (NUL-terminated) selects the root block, i.e.
                        // the fixed file-version record.
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            // Version compatibility check (mirrors the Linux/macOS path).
            // If version could not be detected (detected_major == 0) we let
            // the load succeed — the ort crate will diagnose further.
            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
1023
1024#[cfg(any(target_os = "linux", target_os = "macos"))]
1025unsafe fn loaded_library_path_from_handle(handle: *mut std::ffi::c_void) -> Option<String> {
1026    let symbol_name = std::ffi::CString::new("OrtGetApiBase").ok()?;
1027    let symbol = unsafe { libc::dlsym(handle, symbol_name.as_ptr()) };
1028    if symbol.is_null() {
1029        return None;
1030    }
1031
1032    let mut info = std::mem::MaybeUninit::<libc::Dl_info>::uninit();
1033    if unsafe { libc::dladdr(symbol, info.as_mut_ptr()) } == 0 {
1034        return None;
1035    }
1036
1037    let info = unsafe { info.assume_init() };
1038    if info.dli_fname.is_null() {
1039        return None;
1040    }
1041
1042    Some(
1043        unsafe { std::ffi::CStr::from_ptr(info.dli_fname) }
1044            .to_string_lossy()
1045            .into_owned(),
1046    )
1047}
1048
1049#[cfg(any(target_os = "linux", target_os = "macos"))]
1050fn detect_ort_version_from_resolved_or_requested(
1051    resolved_path: Option<String>,
1052    requested_lib_name: &str,
1053) -> (Option<String>, String) {
1054    if let Some(path) = resolved_path {
1055        if let Some(version) = detect_ort_version_from_path(&path) {
1056            return (Some(version), path);
1057        }
1058        return (detect_ort_version_from_path(requested_lib_name), path);
1059    }
1060
1061    (
1062        detect_ort_version_from_path(requested_lib_name),
1063        requested_lib_name.to_string(),
1064    )
1065}
1066
1067#[cfg(any(target_os = "linux", target_os = "macos"))]
1068fn detect_ort_version_from_loaded_library(
1069    handle: *mut std::ffi::c_void,
1070    requested_lib_name: &str,
1071) -> (Option<String>, String) {
1072    detect_ort_version_from_resolved_or_requested(
1073        unsafe { loaded_library_path_from_handle(handle) },
1074        requested_lib_name,
1075    )
1076}
1077
1078/// Try to extract the ORT version from the library filename or resolved symlink.
1079/// Examples: "libonnxruntime.so.1.19.0" → "1.19.0", "libonnxruntime.1.24.4.dylib" → "1.24.4"
1080#[cfg(any(target_os = "linux", target_os = "macos"))]
1081fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
1082    let path = std::path::Path::new(lib_path);
1083
1084    // Try the path as given, then follow symlinks
1085    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
1086        .into_iter()
1087        .flatten()
1088    {
1089        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1090            if let Some(version) = extract_version_from_filename(name) {
1091                return Some(version);
1092            }
1093        }
1094    }
1095
1096    // Also check for versioned siblings in the same directory
1097    if let Some(parent) = path.parent() {
1098        if let Ok(entries) = std::fs::read_dir(parent) {
1099            for entry in entries.flatten() {
1100                if let Some(name) = entry.file_name().to_str() {
1101                    if name.starts_with("libonnxruntime") {
1102                        if let Some(version) = extract_version_from_filename(name) {
1103                            return Some(version);
1104                        }
1105                    }
1106                }
1107            }
1108        }
1109    }
1110
1111    None
1112}
1113
/// Extract version from filenames like "libonnxruntime.so.1.19.0" or "libonnxruntime.1.24.4.dylib"
///
/// Returns the leftmost `X.Y.Z` triple of ASCII digit runs, or `None` when the
/// name contains no such triple. Only the first three components are returned
/// ("1.2.3.4" yields "1.2.3").
///
/// Implemented as a small scanner rather than a regular expression: this is
/// called once per directory entry during sibling scans, and compiling a
/// pattern on every call is wasted work. Restricting the match to ASCII digits
/// also guarantees that every component can be parsed as an integer by the
/// version gate.
#[cfg(any(target_os = "linux", target_os = "macos"))]
fn extract_version_from_filename(name: &str) -> Option<String> {
    /// End index of the ASCII digit run starting at `from`, if it is non-empty.
    fn digit_run_end(bytes: &[u8], from: usize) -> Option<usize> {
        let len = bytes
            .get(from..)?
            .iter()
            .take_while(|byte| byte.is_ascii_digit())
            .count();
        (len > 0).then_some(from + len)
    }

    /// End index of a `.<digits>` component starting at `from`.
    fn dotted_component_end(bytes: &[u8], from: usize) -> Option<usize> {
        if bytes.get(from) == Some(&b'.') {
            digit_run_end(bytes, from + 1)
        } else {
            None
        }
    }

    let bytes = name.as_bytes();
    let mut cursor = 0;
    while cursor < bytes.len() {
        match digit_run_end(bytes, cursor) {
            None => cursor += 1,
            Some(major_end) => {
                let patch_end = dotted_component_end(bytes, major_end)
                    .and_then(|minor_end| dotted_component_end(bytes, minor_end));
                if let Some(end) = patch_end {
                    // `cursor` and `end` both sit next to ASCII bytes, so they
                    // are valid UTF-8 boundaries.
                    return Some(name[cursor..end].to_string());
                }
                // Any start inside this same digit run would fail the same
                // way, so resume right after it.
                cursor = major_end;
            }
        }
    }
    None
}
1121
/// Build the indented shell command suggested for removing an outdated ORT.
///
/// Libraries under `/usr/local/lib`, or referenced only by their bare default
/// name, get the system-wide cleanup command on Linux and macOS. Every other
/// case (including all paths on other platforms) gets a plain `rm` of the
/// given path.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_bare_default_name = matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");
    let is_system_install = lib_path.starts_with("/usr/local/lib") || is_bare_default_name;

    if is_system_install {
        // On targets other than Linux/macOS this branch is empty and falls
        // through to the generic command below.
        #[cfg(target_os = "linux")]
        return "   sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        #[cfg(target_os = "macos")]
        return "   sudo rm /usr/local/lib/libonnxruntime*".to_string();
    }

    format!("   rm '{}'", lib_path)
}
1134
1135/// Build the user-facing error message for an incompatible ONNX Runtime
1136/// install. Extracted as a pure helper so we can unit-test the wording
1137/// stability — the auto-fix recommendation must always come first because
1138/// it's the only safe option, and the system-rm step must remain present
1139/// because some users prefer the system-wide cleanup path.
1140pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1141    format!(
1142        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1143         Solutions:\n\
1144         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1145         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1146         configures the bridge to load it instead of the system library — no \
1147         changes to '{}'.\n\
1148         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1149         {}\n\
1150         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1151         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1152        version,
1153        lib_name,
1154        lib_name,
1155        suggest_removal_command(lib_name),
1156    )
1157}
1158
/// Decide whether an error message means the ONNX Runtime library is missing.
///
/// Messages that start (after leading whitespace) with our own
/// `"ONNX Runtime not found."` prefix match immediately. Otherwise the
/// message must, case-insensitively, both name the runtime and contain a
/// phrase typical of a dynamic-library load failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_NAMES: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_PHRASES: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    let is_own_prefix = message.trim_start().starts_with("ONNX Runtime not found.");
    if is_own_prefix {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

    mentions_any(&RUNTIME_NAMES) && mentions_any(&LOAD_FAILURE_PHRASES)
}
1184
1185pub fn format_embedding_init_error(error: impl Display) -> String {
1186    let message = error.to_string();
1187
1188    if is_onnx_runtime_unavailable(&message) {
1189        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1190    }
1191
1192    format!("failed to initialize semantic embedding model: {message}")
1193}
1194
/// A chunk of code ready for embedding — derived from a Symbol with context enrichment.
///
/// `embed_text` is the string handed to the embedding backend; the remaining
/// fields are metadata carried alongside the resulting vector.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// Absolute file path
    pub file: PathBuf,
    /// Symbol name
    pub name: String,
    /// Symbol kind (function, class, struct, etc.)
    pub kind: SymbolKind,
    /// Line range (0-based internally, inclusive)
    pub start_line: u32,
    /// Last line of the range (see `start_line`).
    pub end_line: u32,
    /// Whether the symbol is exported
    pub exported: bool,
    /// The enriched text that gets embedded (scope + signature + body snippet).
    /// Its hash and exact contents are also what refresh-time vector reuse
    /// matches on.
    pub embed_text: String,
    /// Short code snippet for display in results
    pub snippet: String,
}
1214
/// A stored embedding entry — chunk metadata + vector
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    /// The chunk this entry describes.
    chunk: SemanticChunk,
    /// Embedding vector paired with `chunk` (produced for, or reused for, its
    /// `embed_text`).
    vector: Vec<f32>,
}
1221
/// The semantic index — stores embeddings for all symbols in a project
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// One entry per embedded chunk.
    entries: Vec<EmbeddingEntry>,
    /// Track which files are indexed and their mtime for staleness detection
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Track indexed file sizes alongside mtimes for staleness detection
    file_sizes: HashMap<PathBuf, u64>,
    /// Per-file blake3 content hash recorded when the file was indexed.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Embedding dimension (384 for MiniLM-L6-v2)
    dimension: usize,
    /// `None` for a freshly constructed index.
    /// NOTE(review): presumably identifies the backend/model that produced the
    /// vectors — confirm against `SemanticIndexFingerprint`.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root; constructors assert in debug builds that it is absolute.
    project_root: PathBuf,
    /// Starts empty.
    /// NOTE(review): how files become deferred is not visible in this part of
    /// the file — confirm before relying on it.
    deferred_files: HashSet<PathBuf>,
}
1237
/// Per-file metadata captured while collecting chunks. It is later split into
/// the index's `file_mtimes`, `file_sizes` and `file_hashes` tables.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time of the file.
    mtime: SystemTime,
    /// File size; presumably in bytes — TODO confirm at the collection site.
    size: u64,
    /// blake3 hash associated with the file's content.
    content_hash: blake3::Hash,
}
1244
/// Outcome of an incremental refresh of the semantic index.
///
/// All counters are file counts; `total_processed` is the number of
/// current/deleted files that were considered.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    pub changed: usize,
    pub added: usize,
    pub deleted: usize,
    pub total_processed: usize,
}

impl RefreshSummary {
    /// True when the refresh changed, added and deleted nothing.
    /// `total_processed` is deliberately not part of the check.
    pub fn is_noop(&self) -> bool {
        matches!((self.changed, self.added, self.deleted), (0, 0, 0))
    }
}
1261
/// Delta produced by refreshing a set of invalidated files: replacement
/// entries, refreshed per-file metadata, the paths that were fully handled,
/// and summary counts.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Full replacement entries for `completed_paths`, not just newly embedded
    /// chunks. `apply_refresh_update` removes completed paths before extending
    /// this set, so reused chunks must travel in this delta too.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Freshness metadata paired with the file path it belongs to.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Paths whose entries are replaced by `added_entries`.
    pub completed_paths: Vec<PathBuf>,
    /// File counts for this refresh.
    pub summary: RefreshSummary,
}
1272
/// A previously computed vector that can be reused when a chunk's embed text
/// is unchanged.
#[derive(Debug, Clone)]
struct ReusableEmbedding {
    /// Exact text the vector was stored with; compared verbatim after a hash
    /// match so a hash collision cannot hand back the wrong vector.
    embed_text: String,
    /// The stored embedding vector.
    vector: Vec<f32>,
}
1278
/// Reuse lookup: file path → blake3 hash of a chunk's `embed_text` → every
/// stored embedding in that file whose text has that hash.
type ChunkReuseMap = HashMap<PathBuf, HashMap<blake3::Hash, Vec<ReusableEmbedding>>>;
1280
/// Search result from a semantic query
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File containing the matched symbol.
    pub file: PathBuf,
    /// Symbol name.
    pub name: String,
    /// Symbol kind.
    pub kind: SymbolKind,
    /// First line of the matched symbol.
    /// NOTE(review): presumably the same 0-based inclusive convention as
    /// `SemanticChunk` — confirm where results are built.
    pub start_line: u32,
    /// Last line of the matched symbol.
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Short code snippet for display.
    pub snippet: String,
    /// Match score.
    /// NOTE(review): the metric and its range are not visible in this part of
    /// the file.
    pub score: f32,
    /// Static label describing where the result came from.
    /// NOTE(review): the set of possible values is not visible here.
    pub source: &'static str,
}
1294
1295impl SemanticIndex {
1296    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1297        debug_assert!(project_root.is_absolute());
1298        Self {
1299            entries: Vec::new(),
1300            file_mtimes: HashMap::new(),
1301            file_sizes: HashMap::new(),
1302            file_hashes: HashMap::new(),
1303            dimension,
1304            fingerprint: None,
1305            project_root,
1306            deferred_files: HashSet::new(),
1307        }
1308    }
1309
    /// Number of embedded symbol entries.
    ///
    /// This is the length of the entry list, i.e. one per stored
    /// chunk/vector pair.
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
1314
    /// Number of files currently tracked by the semantic index.
    ///
    /// Counted from the per-file mtime table, so a file with recorded
    /// metadata is included even if it contributed no entries.
    pub fn indexed_file_count(&self) -> usize {
        self.file_mtimes.len()
    }
1319
1320    /// Human-readable status label for the index.
1321    pub fn status_label(&self) -> &'static str {
1322        if self.entries.is_empty() {
1323            "empty"
1324        } else {
1325            "ready"
1326        }
1327    }
1328
1329    fn collect_chunks(
1330        project_root: &Path,
1331        files: &[PathBuf],
1332    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1333        let collect_started = std::time::Instant::now();
1334        let per_file: Vec<(
1335            PathBuf,
1336            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1337        )> = files
1338            .par_iter()
1339            .map_init(HashMap::new, |parsers, file| {
1340                let result = collect_semantic_file(project_root, file, parsers);
1341                (file.clone(), result)
1342            })
1343            .collect();
1344
1345        let mut chunks: Vec<SemanticChunk> = Vec::new();
1346        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1347
1348        for (file, result) in per_file {
1349            match result {
1350                Ok((metadata, file_chunks)) => {
1351                    file_metadata.insert(file, metadata);
1352                    chunks.extend(file_chunks);
1353                }
1354                Err(error) => {
1355                    // "unsupported file extension" is expected for non-code files
1356                    // (json, xml, .gitignore, etc.) that get included in the
1357                    // project walk. Pre-fix this was swallowed by .unwrap_or_default();
1358                    // we now skip silently to keep the log clean. Only real read/parse
1359                    // errors are worth surfacing.
1360                    if error == "unsupported file extension" {
1361                        continue;
1362                    }
1363                    slog_warn!(
1364                        "failed to collect semantic chunks for {}: {}",
1365                        file.display(),
1366                        error
1367                    );
1368                }
1369            }
1370        }
1371
1372        slog_info!(
1373            "semantic collect: {} chunks from {} files in {} ms",
1374            chunks.len(),
1375            file_metadata.len(),
1376            collect_started.elapsed().as_millis()
1377        );
1378
1379        (chunks, file_metadata)
1380    }
1381
1382    fn build_chunk_reuse_map(&self, files: &[PathBuf]) -> ChunkReuseMap {
1383        let requested: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1384        let mut reuse_map: ChunkReuseMap = HashMap::new();
1385
1386        for entry in &self.entries {
1387            if !requested.contains(entry.chunk.file.as_path()) {
1388                continue;
1389            }
1390
1391            // `embed_text` is already persisted in the current on-disk format,
1392            // so refresh-time reuse can hash it in memory and confirm the exact
1393            // string without bumping `SEMANTIC_INDEX_VERSION` and forcing every
1394            // user through a full rebuild.
1395            let hash = blake3::hash(entry.chunk.embed_text.as_bytes());
1396            reuse_map
1397                .entry(entry.chunk.file.clone())
1398                .or_default()
1399                .entry(hash)
1400                .or_default()
1401                .push(ReusableEmbedding {
1402                    embed_text: entry.chunk.embed_text.clone(),
1403                    vector: entry.vector.clone(),
1404                });
1405        }
1406
1407        reuse_map
1408    }
1409
1410    fn reusable_vector_for_chunk(
1411        reuse_map: &ChunkReuseMap,
1412        chunk: &SemanticChunk,
1413    ) -> Option<Vec<f32>> {
1414        let hash = blake3::hash(chunk.embed_text.as_bytes());
1415        reuse_map
1416            .get(&chunk.file)?
1417            .get(&hash)?
1418            .iter()
1419            .find(|candidate| candidate.embed_text == chunk.embed_text)
1420            .map(|candidate| candidate.vector.clone())
1421    }
1422
1423    fn entries_for_chunks_with_reuse<F, P>(
1424        chunks: Vec<SemanticChunk>,
1425        reuse_map: &ChunkReuseMap,
1426        embed_fn: &mut F,
1427        max_batch_size: usize,
1428        initial_observed_dimension: Option<usize>,
1429        refresh_label: &str,
1430        progress: &mut P,
1431    ) -> Result<(Vec<EmbeddingEntry>, Option<usize>), String>
1432    where
1433        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1434        P: FnMut(usize, usize),
1435    {
1436        let total_chunks = chunks.len();
1437        progress(0, total_chunks);
1438
1439        let mut entries_by_chunk: Vec<Option<EmbeddingEntry>> = vec![None; total_chunks];
1440        let mut misses: Vec<(usize, SemanticChunk)> = Vec::new();
1441
1442        for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1443            if let Some(vector) = Self::reusable_vector_for_chunk(reuse_map, &chunk) {
1444                entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1445            } else {
1446                misses.push((chunk_index, chunk));
1447            }
1448        }
1449
1450        let mut completed = total_chunks.saturating_sub(misses.len());
1451        if completed > 0 {
1452            progress(completed, total_chunks);
1453        }
1454
1455        let batch_size = max_batch_size.max(1);
1456        let mut observed_dimension = initial_observed_dimension;
1457
1458        for batch_start in (0..misses.len()).step_by(batch_size) {
1459            let batch_end = (batch_start + batch_size).min(misses.len());
1460            let batch_texts: Vec<String> = misses[batch_start..batch_end]
1461                .iter()
1462                .map(|(_, chunk)| chunk.embed_text.clone())
1463                .collect();
1464
1465            let vectors = embed_fn(batch_texts)?;
1466            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1467
1468            if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1469                match observed_dimension {
1470                    None => observed_dimension = Some(dim),
1471                    Some(expected) if dim != expected => {
1472                        return Err(format!(
1473                            "embedding dimension changed during {refresh_label}: \
1474                             cached index uses {expected}, new vectors use {dim}"
1475                        ));
1476                    }
1477                    _ => {}
1478                }
1479            }
1480
1481            for (i, vector) in vectors.into_iter().enumerate() {
1482                let (chunk_index, chunk) = misses[batch_start + i].clone();
1483                entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1484            }
1485
1486            completed += batch_end - batch_start;
1487            progress(completed, total_chunks);
1488        }
1489
1490        let entries = entries_by_chunk
1491            .into_iter()
1492            .map(|entry| entry.expect("semantic refresh accounted for every chunk"))
1493            .collect();
1494
1495        Ok((entries, observed_dimension))
1496    }
1497
1498    fn build_from_chunks<F, P>(
1499        project_root: &Path,
1500        chunks: Vec<SemanticChunk>,
1501        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1502        embed_fn: &mut F,
1503        max_batch_size: usize,
1504        mut progress: Option<&mut P>,
1505    ) -> Result<Self, String>
1506    where
1507        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1508        P: FnMut(usize, usize),
1509    {
1510        debug_assert!(project_root.is_absolute());
1511        let total_chunks = chunks.len();
1512
1513        if chunks.is_empty() {
1514            return Ok(Self {
1515                entries: Vec::new(),
1516                file_mtimes: file_metadata
1517                    .iter()
1518                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
1519                    .collect(),
1520                file_sizes: file_metadata
1521                    .iter()
1522                    .map(|(path, metadata)| (path.clone(), metadata.size))
1523                    .collect(),
1524                file_hashes: file_metadata
1525                    .into_iter()
1526                    .map(|(path, metadata)| (path, metadata.content_hash))
1527                    .collect(),
1528                dimension: DEFAULT_DIMENSION,
1529                fingerprint: None,
1530                project_root: project_root.to_path_buf(),
1531                deferred_files: HashSet::new(),
1532            });
1533        }
1534
1535        // Embed in batches
1536        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1537        let mut expected_dimension: Option<usize> = None;
1538        let batch_size = max_batch_size.max(1);
1539        let embed_started = std::time::Instant::now();
1540        let batch_count = total_chunks.div_ceil(batch_size);
1541        for batch_start in (0..chunks.len()).step_by(batch_size) {
1542            let batch_end = (batch_start + batch_size).min(chunks.len());
1543            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1544                .iter()
1545                .map(|c| c.embed_text.clone())
1546                .collect();
1547
1548            let vectors = embed_fn(batch_texts)?;
1549            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1550
1551            // Track consistent dimension across all batches
1552            if let Some(dim) = vectors.first().map(|v| v.len()) {
1553                match expected_dimension {
1554                    None => expected_dimension = Some(dim),
1555                    Some(expected) if dim != expected => {
1556                        return Err(format!(
1557                            "embedding dimension changed across batches: expected {expected}, got {dim}"
1558                        ));
1559                    }
1560                    _ => {}
1561                }
1562            }
1563
1564            for (i, vector) in vectors.into_iter().enumerate() {
1565                let chunk_idx = batch_start + i;
1566                entries.push(EmbeddingEntry {
1567                    chunk: chunks[chunk_idx].clone(),
1568                    vector,
1569                });
1570            }
1571
1572            if let Some(callback) = progress.as_mut() {
1573                callback(entries.len(), total_chunks);
1574            }
1575        }
1576
1577        let embed_ms = embed_started.elapsed().as_millis();
1578        let rate = (total_chunks as u128 * 1000)
1579            .checked_div(embed_ms)
1580            .unwrap_or(0) as u64;
1581        slog_info!(
1582            "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1583            total_chunks,
1584            batch_count,
1585            embed_ms,
1586            rate
1587        );
1588
1589        let dimension = entries
1590            .first()
1591            .map(|e| e.vector.len())
1592            .unwrap_or(DEFAULT_DIMENSION);
1593
1594        Ok(Self {
1595            entries,
1596            file_mtimes: file_metadata
1597                .iter()
1598                .map(|(path, metadata)| (path.clone(), metadata.mtime))
1599                .collect(),
1600            file_sizes: file_metadata
1601                .iter()
1602                .map(|(path, metadata)| (path.clone(), metadata.size))
1603                .collect(),
1604            file_hashes: file_metadata
1605                .into_iter()
1606                .map(|(path, metadata)| (path, metadata.content_hash))
1607                .collect(),
1608            dimension,
1609            fingerprint: None,
1610            project_root: project_root.to_path_buf(),
1611            deferred_files: HashSet::new(),
1612        })
1613    }
1614
1615    /// Build the semantic index from a set of files using the provided embedding function.
1616    /// `embed_fn` takes a batch of texts and returns a batch of embedding vectors.
1617    pub fn build<F>(
1618        project_root: &Path,
1619        files: &[PathBuf],
1620        embed_fn: &mut F,
1621        max_batch_size: usize,
1622    ) -> Result<Self, String>
1623    where
1624        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1625    {
1626        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1627        Self::build_from_chunks(
1628            project_root,
1629            chunks,
1630            file_mtimes,
1631            embed_fn,
1632            max_batch_size,
1633            Option::<&mut fn(usize, usize)>::None,
1634        )
1635    }
1636
1637    /// Build the semantic index and report embedding progress using entry counts.
1638    pub fn build_with_progress<F, P>(
1639        project_root: &Path,
1640        files: &[PathBuf],
1641        embed_fn: &mut F,
1642        max_batch_size: usize,
1643        progress: &mut P,
1644    ) -> Result<Self, String>
1645    where
1646        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1647        P: FnMut(usize, usize),
1648    {
1649        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1650        let total_chunks = chunks.len();
1651        progress(0, total_chunks);
1652        Self::build_from_chunks(
1653            project_root,
1654            chunks,
1655            file_mtimes,
1656            embed_fn,
1657            max_batch_size,
1658            Some(progress),
1659        )
1660    }
1661
1662    /// Incrementally refresh entries for changed/new files only, preserving cached
1663    /// embeddings for unchanged files. Used when loading the index from disk and
1664    /// finding that a small fraction of files have moved on, deleted, or appeared.
1665    ///
1666    /// Returns `RefreshSummary` describing what changed. On success, `self` is
1667    /// mutated in place and remains a valid index.
1668    ///
1669    /// `current_files` is the full set of files the project considers indexable
1670    /// (typically `walk_project_files(...)`). Files in the cache that are no
1671    /// longer in this set are treated as deleted.
1672    pub fn refresh_stale_files<F, P>(
1673        &mut self,
1674        project_root: &Path,
1675        current_files: &[PathBuf],
1676        embed_fn: &mut F,
1677        max_batch_size: usize,
1678        progress: &mut P,
1679    ) -> Result<RefreshSummary, String>
1680    where
1681        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1682        P: FnMut(usize, usize),
1683    {
1684        self.backfill_missing_file_sizes();
1685
1686        // 1. Bucket files into deleted / changed / added.
1687        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
1688        self.deferred_files
1689            .retain(|path| current_set.contains(path.as_path()));
1690        let total_processed = current_set.len() + self.file_mtimes.len()
1691            - self
1692                .file_mtimes
1693                .keys()
1694                .filter(|path| current_set.contains(path.as_path()))
1695                .count();
1696
1697        // Files in cache that disappeared from disk OR are no longer in the
1698        // walked set. Both cases need their entries dropped.
1699        enum IndexedFileCheck {
1700            Deleted(PathBuf),
1701            MissingMetadata(PathBuf),
1702            Verified(PathBuf, FreshnessVerdict),
1703        }
1704
1705        let mut deleted: Vec<PathBuf> = Vec::new();
1706        let mut changed: Vec<PathBuf> = Vec::new();
1707        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
1708        let mut checks: Vec<Option<IndexedFileCheck>> = Vec::with_capacity(indexed_paths.len());
1709        let mut strict_verify_inputs: Vec<(usize, PathBuf, FileFreshness)> = Vec::new();
1710
1711        for indexed_path in indexed_paths {
1712            let check_index = checks.len();
1713            if !current_set.contains(indexed_path.as_path()) {
1714                checks.push(Some(IndexedFileCheck::Deleted(indexed_path)));
1715                continue;
1716            }
1717            let cached = match (
1718                self.file_mtimes.get(&indexed_path),
1719                self.file_sizes.get(&indexed_path),
1720                self.file_hashes.get(&indexed_path),
1721            ) {
1722                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
1723                    mtime: *mtime,
1724                    size: *size,
1725                    content_hash: *hash,
1726                }),
1727                _ => None,
1728            };
1729            if let Some(freshness) = cached {
1730                strict_verify_inputs.push((check_index, indexed_path, freshness));
1731                checks.push(None);
1732            } else {
1733                checks.push(Some(IndexedFileCheck::MissingMetadata(indexed_path)));
1734            }
1735        }
1736
1737        for (check_index, path, verdict) in
1738            cache_freshness::verify_files_strict_bounded(strict_verify_inputs)
1739        {
1740            checks[check_index] = Some(IndexedFileCheck::Verified(path, verdict));
1741        }
1742
1743        for check in checks {
1744            match check.expect("strict freshness check should be populated") {
1745                IndexedFileCheck::Deleted(path) => deleted.push(path),
1746                IndexedFileCheck::MissingMetadata(path) => changed.push(path),
1747                IndexedFileCheck::Verified(_path, FreshnessVerdict::HotFresh) => {}
1748                IndexedFileCheck::Verified(
1749                    path,
1750                    FreshnessVerdict::ContentFresh {
1751                        new_mtime,
1752                        new_size,
1753                    },
1754                ) => {
1755                    self.file_mtimes.insert(path.clone(), new_mtime);
1756                    self.file_sizes.insert(path, new_size);
1757                }
1758                IndexedFileCheck::Verified(
1759                    path,
1760                    FreshnessVerdict::Stale | FreshnessVerdict::Deleted,
1761                ) => {
1762                    changed.push(path);
1763                }
1764            }
1765        }
1766
1767        // Files in walk that were never indexed.
1768        let mut added: Vec<PathBuf> = Vec::new();
1769        for path in current_files {
1770            if !self.file_mtimes.contains_key(path) {
1771                added.push(path.clone());
1772            }
1773        }
1774
1775        // Fast path: nothing to do.
1776        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
1777            progress(0, 0);
1778            return Ok(RefreshSummary {
1779                total_processed,
1780                ..RefreshSummary::default()
1781            });
1782        }
1783
1784        // 2. Drop entries for deleted files immediately. Changed files are only
1785        //    replaced after successful re-extraction + embedding so transient
1786        //    read/parse errors keep the stale-but-valid cache entry.
1787        if !deleted.is_empty() {
1788            self.remove_indexed_files(&deleted);
1789        }
1790
1791        // 3. Embed the changed + added set, if any.
1792        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
1793        to_embed.extend(changed.iter().cloned());
1794        to_embed.extend(added.iter().cloned());
1795
1796        if to_embed.is_empty() {
1797            // Only deletions happened.
1798            progress(0, 0);
1799            return Ok(RefreshSummary {
1800                changed: 0,
1801                added: 0,
1802                deleted: deleted.len(),
1803                total_processed,
1804            });
1805        }
1806
1807        let reuse_map = self.build_chunk_reuse_map(&changed);
1808        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
1809        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
1810        let vanished = to_embed
1811            .iter()
1812            .filter(|path| {
1813                changed_set.contains(path.as_path())
1814                    && !fresh_metadata.contains_key(*path)
1815                    && !path.exists()
1816            })
1817            .cloned()
1818            .collect::<Vec<_>>();
1819        if !vanished.is_empty() {
1820            self.remove_indexed_files(&vanished);
1821            deleted.extend(vanished);
1822        }
1823
1824        if chunks.is_empty() {
1825            progress(0, 0);
1826            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1827            for file in &successful_files {
1828                self.deferred_files.remove(file);
1829            }
1830            if !successful_files.is_empty() {
1831                self.entries
1832                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
1833            }
1834            let changed_count = changed
1835                .iter()
1836                .filter(|path| successful_files.contains(*path))
1837                .count();
1838            let added_count = added
1839                .iter()
1840                .filter(|path| successful_files.contains(*path))
1841                .count();
1842            for (file, metadata) in fresh_metadata {
1843                self.file_mtimes.insert(file.clone(), metadata.mtime);
1844                self.file_sizes.insert(file.clone(), metadata.size);
1845                self.file_hashes.insert(file.clone(), metadata.content_hash);
1846            }
1847            return Ok(RefreshSummary {
1848                changed: changed_count,
1849                added: added_count,
1850                deleted: deleted.len(),
1851                total_processed,
1852            });
1853        }
1854
1855        // 4. Build the full replacement set, reusing cached vectors for chunks
1856        //    whose embed_text is unchanged and embedding only cache misses.
1857        let existing_dimension = if self.entries.is_empty() {
1858            None
1859        } else {
1860            Some(self.dimension)
1861        };
1862        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
1863            chunks,
1864            &reuse_map,
1865            embed_fn,
1866            max_batch_size,
1867            existing_dimension,
1868            "incremental refresh",
1869            progress,
1870        )?;
1871
1872        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1873        for file in &successful_files {
1874            self.deferred_files.remove(file);
1875        }
1876        if !successful_files.is_empty() {
1877            self.entries
1878                .retain(|entry| !successful_files.contains(&entry.chunk.file));
1879        }
1880
1881        self.entries.extend(new_entries);
1882        for (file, metadata) in fresh_metadata {
1883            self.file_mtimes.insert(file.clone(), metadata.mtime);
1884            self.file_sizes.insert(file.clone(), metadata.size);
1885            self.file_hashes.insert(file, metadata.content_hash);
1886        }
1887        if let Some(dim) = observed_dimension {
1888            self.dimension = dim;
1889        }
1890
1891        Ok(RefreshSummary {
1892            changed: changed
1893                .iter()
1894                .filter(|path| successful_files.contains(*path))
1895                .count(),
1896            added: added
1897                .iter()
1898                .filter(|path| successful_files.contains(*path))
1899                .count(),
1900            deleted: deleted.len(),
1901            total_processed,
1902        })
1903    }
1904
1905    /// Refresh exactly the files invalidated by the live watcher, without
1906    /// treating the provided path list as the whole project. This is the
1907    /// watcher-side counterpart to `refresh_stale_files`: it drops any stale
1908    /// entries for the requested paths from this in-memory index, re-extracts
1909    /// whatever still exists on disk, embeds those chunks, and returns the
1910    /// delta needed for another in-memory index to apply the same update.
1911    pub fn refresh_invalidated_files<F, P>(
1912        &mut self,
1913        project_root: &Path,
1914        paths: &[PathBuf],
1915        embed_fn: &mut F,
1916        max_batch_size: usize,
1917        max_files: usize,
1918        progress: &mut P,
1919    ) -> Result<InvalidatedFilesRefresh, String>
1920    where
1921        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1922        P: FnMut(usize, usize),
1923    {
1924        self.backfill_missing_file_sizes();
1925
1926        self.deferred_files.retain(|path| path.exists());
1927        let mut requested_paths = paths.to_vec();
1928        requested_paths.extend(self.deferred_files.iter().cloned());
1929        requested_paths.sort();
1930        requested_paths.dedup();
1931        let total_processed = requested_paths.len();
1932
1933        if requested_paths.is_empty() {
1934            progress(0, 0);
1935            return Ok(InvalidatedFilesRefresh {
1936                summary: RefreshSummary {
1937                    total_processed,
1938                    ..RefreshSummary::default()
1939                },
1940                ..InvalidatedFilesRefresh::default()
1941            });
1942        }
1943
1944        let previously_indexed: HashSet<PathBuf> = requested_paths
1945            .iter()
1946            .filter(|path| self.file_mtimes.contains_key(*path))
1947            .cloned()
1948            .collect();
1949        let reuse_map = self.build_chunk_reuse_map(&requested_paths);
1950
1951        // The watcher path has already invalidated these files in the request
1952        // thread's live index. Mirror that behavior here before inserting any
1953        // fresh chunks so parse/read failures do not resurrect stale entries.
1954        self.remove_indexed_files(&requested_paths);
1955
1956        let existing_paths = requested_paths
1957            .iter()
1958            .filter(|path| path.exists())
1959            .cloned()
1960            .collect::<Vec<_>>();
1961        let deleted = requested_paths
1962            .iter()
1963            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
1964            .count();
1965
1966        if existing_paths.is_empty() {
1967            for path in &requested_paths {
1968                if !path.exists() {
1969                    self.deferred_files.remove(path);
1970                }
1971            }
1972            progress(0, 0);
1973            return Ok(InvalidatedFilesRefresh {
1974                completed_paths: requested_paths,
1975                summary: RefreshSummary {
1976                    deleted,
1977                    total_processed,
1978                    ..RefreshSummary::default()
1979                },
1980                ..InvalidatedFilesRefresh::default()
1981            });
1982        }
1983
1984        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);
1985
1986        let retained_file_count = self.file_mtimes.len();
1987        let changed_successful_count = existing_paths
1988            .iter()
1989            .filter(|path| {
1990                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1991            })
1992            .count();
1993        let available_new_files =
1994            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
1995        let new_successful_files = existing_paths
1996            .iter()
1997            .filter(|path| {
1998                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1999            })
2000            .cloned()
2001            .collect::<Vec<_>>();
2002        if new_successful_files.len() > available_new_files {
2003            let allowed_new_files = new_successful_files
2004                .iter()
2005                .take(available_new_files)
2006                .cloned()
2007                .collect::<HashSet<_>>();
2008            let deferred_new_files = new_successful_files
2009                .into_iter()
2010                .filter(|path| !allowed_new_files.contains(path))
2011                .collect::<HashSet<_>>();
2012
2013            fresh_metadata.retain(|file, _| {
2014                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
2015            });
2016            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));
2017
2018            if !deferred_new_files.is_empty() {
2019                for path in &deferred_new_files {
2020                    self.deferred_files.insert(path.clone());
2021                }
2022                slog_warn!(
2023                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
2024                    deferred_new_files.len(),
2025                    max_files
2026                );
2027            }
2028        }
2029
2030        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
2031        for file in &successful_files {
2032            self.deferred_files.remove(file);
2033        }
2034        let changed = successful_files
2035            .iter()
2036            .filter(|path| previously_indexed.contains(path.as_path()))
2037            .count();
2038        let added = successful_files.len().saturating_sub(changed);
2039        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());
2040
2041        if chunks.is_empty() {
2042            progress(0, 0);
2043            for (file, metadata) in fresh_metadata {
2044                let freshness = FileFreshness {
2045                    mtime: metadata.mtime,
2046                    size: metadata.size,
2047                    content_hash: metadata.content_hash,
2048                };
2049                self.file_mtimes.insert(file.clone(), freshness.mtime);
2050                self.file_sizes.insert(file.clone(), freshness.size);
2051                self.file_hashes
2052                    .insert(file.clone(), freshness.content_hash);
2053                updated_metadata.push((file, freshness));
2054            }
2055
2056            return Ok(InvalidatedFilesRefresh {
2057                updated_metadata,
2058                completed_paths: requested_paths,
2059                summary: RefreshSummary {
2060                    changed,
2061                    added,
2062                    deleted,
2063                    total_processed,
2064                },
2065                ..InvalidatedFilesRefresh::default()
2066            });
2067        }
2068
2069        let initial_observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty()
2070        {
2071            None
2072        } else {
2073            Some(self.dimension)
2074        };
2075        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
2076            chunks,
2077            &reuse_map,
2078            embed_fn,
2079            max_batch_size,
2080            initial_observed_dimension,
2081            "invalidated-file refresh",
2082            progress,
2083        )?;
2084
2085        let added_entries = new_entries.clone();
2086        self.entries.extend(new_entries);
2087        for (file, metadata) in fresh_metadata {
2088            let freshness = FileFreshness {
2089                mtime: metadata.mtime,
2090                size: metadata.size,
2091                content_hash: metadata.content_hash,
2092            };
2093            self.file_mtimes.insert(file.clone(), freshness.mtime);
2094            self.file_sizes.insert(file.clone(), freshness.size);
2095            self.file_hashes
2096                .insert(file.clone(), freshness.content_hash);
2097            updated_metadata.push((file, freshness));
2098        }
2099        if let Some(dim) = observed_dimension {
2100            self.dimension = dim;
2101        }
2102
2103        Ok(InvalidatedFilesRefresh {
2104            added_entries,
2105            updated_metadata,
2106            completed_paths: requested_paths,
2107            summary: RefreshSummary {
2108                changed,
2109                added,
2110                deleted,
2111                total_processed,
2112            },
2113        })
2114    }
2115
2116    pub fn apply_refresh_update(
2117        &mut self,
2118        added_entries: Vec<EmbeddingEntry>,
2119        updated_metadata: Vec<(PathBuf, FileFreshness)>,
2120        completed_paths: &[PathBuf],
2121    ) {
2122        // `added_entries` is the complete replacement set for completed paths:
2123        // freshly embedded misses plus reused chunks carrying refreshed metadata.
2124        // Removing first is safe only because producers include both kinds.
2125        self.remove_indexed_files(completed_paths);
2126
2127        let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
2128        self.entries.extend(added_entries);
2129        for (file, freshness) in updated_metadata {
2130            self.file_mtimes.insert(file.clone(), freshness.mtime);
2131            self.file_sizes.insert(file.clone(), freshness.size);
2132            self.file_hashes.insert(file, freshness.content_hash);
2133        }
2134        if let Some(dim) = observed_dimension {
2135            self.dimension = dim;
2136        }
2137    }
2138
2139    fn remove_indexed_files(&mut self, files: &[PathBuf]) {
2140        let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
2141        self.entries
2142            .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
2143        for path in files {
2144            self.file_mtimes.remove(path);
2145            self.file_sizes.remove(path);
2146            self.file_hashes.remove(path);
2147        }
2148    }
2149
2150    /// Search the index with a query embedding, returning top-K results sorted by relevance
2151    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
2152        if self.entries.is_empty() || query_vector.len() != self.dimension {
2153            return Vec::new();
2154        }
2155
2156        let mut scored: Vec<(f32, usize)> = self
2157            .entries
2158            .iter()
2159            .enumerate()
2160            .map(|(i, entry)| {
2161                let mut score = cosine_similarity(query_vector, &entry.vector);
2162                if entry.chunk.exported {
2163                    score *= 1.1;
2164                }
2165                (score, i)
2166            })
2167            .collect();
2168
2169        let keep = top_k.min(scored.len());
2170        if keep == 0 {
2171            return Vec::new();
2172        }
2173
2174        if keep < scored.len() {
2175            scored.select_nth_unstable_by(keep, semantic_score_order);
2176            scored.truncate(keep);
2177        }
2178        scored.sort_by(semantic_score_order);
2179
2180        scored
2181            .into_iter()
2182            // Keep the selected best-first slice mapped without reintroducing the
2183            // old `> 0.0` floor: top_k has already been selected, and zero-score
2184            // tail entries remain observable when requested.
2185            .map(|(score, idx)| {
2186                let entry = &self.entries[idx];
2187                SemanticResult {
2188                    file: entry.chunk.file.clone(),
2189                    name: entry.chunk.name.clone(),
2190                    kind: entry.chunk.kind.clone(),
2191                    start_line: entry.chunk.start_line,
2192                    end_line: entry.chunk.end_line,
2193                    exported: entry.chunk.exported,
2194                    snippet: entry.chunk.snippet.clone(),
2195                    score,
2196                    source: "semantic",
2197                }
2198            })
2199            .collect()
2200    }
2201
2202    /// Number of indexed entries
2203    pub fn len(&self) -> usize {
2204        self.entries.len()
2205    }
2206
2207    /// Check if a file needs re-indexing based on mtime/size
2208    pub fn is_file_stale(&self, file: &Path) -> bool {
2209        let Some(stored_mtime) = self.file_mtimes.get(file) else {
2210            return true;
2211        };
2212        let Some(stored_size) = self.file_sizes.get(file) else {
2213            return true;
2214        };
2215        let Some(stored_hash) = self.file_hashes.get(file) else {
2216            return true;
2217        };
2218        let cached = FileFreshness {
2219            mtime: *stored_mtime,
2220            size: *stored_size,
2221            content_hash: *stored_hash,
2222        };
2223        match cache_freshness::verify_file_strict(file, &cached) {
2224            FreshnessVerdict::HotFresh => false,
2225            FreshnessVerdict::ContentFresh { .. } => false,
2226            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2227        }
2228    }
2229
2230    fn backfill_missing_file_sizes(&mut self) {
2231        for path in self.file_mtimes.keys() {
2232            if self.file_sizes.contains_key(path) {
2233                continue;
2234            }
2235            if let Ok(metadata) = fs::metadata(path) {
2236                self.file_sizes.insert(path.clone(), metadata.len());
2237                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2238                    self.file_hashes.insert(path.clone(), hash);
2239                }
2240            }
2241        }
2242    }
2243
2244    /// Remove entries for a specific file
2245    pub fn remove_file(&mut self, file: &Path) {
2246        self.invalidate_file(file);
2247    }
2248
2249    pub fn invalidate_file(&mut self, file: &Path) {
2250        let canonical_file = canonicalize_existing_or_deleted_path(file);
2251        self.entries
2252            .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2253        self.file_mtimes.remove(file);
2254        self.file_sizes.remove(file);
2255        self.file_hashes.remove(file);
2256        if canonical_file.as_path() != file {
2257            self.file_mtimes.remove(&canonical_file);
2258            self.file_sizes.remove(&canonical_file);
2259            self.file_hashes.remove(&canonical_file);
2260        }
2261    }
2262
2263    /// Get the embedding dimension
2264    pub fn dimension(&self) -> usize {
2265        self.dimension
2266    }
2267
2268    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
2269        self.fingerprint.as_ref()
2270    }
2271
2272    pub fn backend_label(&self) -> Option<&str> {
2273        self.fingerprint.as_ref().map(|f| f.backend.as_str())
2274    }
2275
2276    pub fn model_label(&self) -> Option<&str> {
2277        self.fingerprint.as_ref().map(|f| f.model.as_str())
2278    }
2279
2280    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
2281        self.fingerprint = Some(fingerprint);
2282    }
2283
2284    /// Write the semantic index to disk using atomic temp+rename pattern
2285    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2286        // Don't persist empty indexes — they would be loaded on next startup
2287        // and prevent a fresh build that might find files.
2288        if self.entries.is_empty() {
2289            slog_info!("skipping semantic index persistence (0 entries)");
2290            return;
2291        }
2292        let dir = storage_dir.join("semantic").join(project_key);
2293        if let Err(e) = fs::create_dir_all(&dir) {
2294            slog_warn!("failed to create semantic cache dir: {}", e);
2295            return;
2296        }
2297        let data_path = dir.join("semantic.bin");
2298        let tmp_path = dir.join(format!(
2299            "semantic.bin.tmp.{}.{}",
2300            std::process::id(),
2301            SystemTime::now()
2302                .duration_since(SystemTime::UNIX_EPOCH)
2303                .unwrap_or(Duration::ZERO)
2304                .as_nanos()
2305        ));
2306        let write_result = (|| -> io::Result<usize> {
2307            let file = fs::File::create(&tmp_path)?;
2308            let mut writer = BufWriter::new(file);
2309            let bytes_written = self.write_to_writer(&mut writer)?;
2310            writer.flush()?;
2311            writer.get_ref().sync_all()?;
2312            Ok(bytes_written)
2313        })();
2314        let bytes_written = match write_result {
2315            Ok(bytes_written) => bytes_written,
2316            Err(e) => {
2317                slog_warn!("failed to write semantic index: {}", e);
2318                let _ = fs::remove_file(&tmp_path);
2319                return;
2320            }
2321        };
2322        if let Err(e) = fs::rename(&tmp_path, &data_path) {
2323            slog_warn!("failed to rename semantic index: {}", e);
2324            let _ = fs::remove_file(&tmp_path);
2325            return;
2326        }
2327        slog_info!(
2328            "semantic index persisted: {} entries, {:.1} KB",
2329            self.entries.len(),
2330            bytes_written as f64 / 1024.0
2331        );
2332    }
2333
2334    /// Read the semantic index from disk
2335    pub fn read_from_disk(
2336        storage_dir: &Path,
2337        project_key: &str,
2338        current_canonical_root: &Path,
2339        is_worktree_bridge: bool,
2340        expected_fingerprint: Option<&str>,
2341    ) -> Option<Self> {
2342        debug_assert!(current_canonical_root.is_absolute());
2343        let data_path = storage_dir
2344            .join("semantic")
2345            .join(project_key)
2346            .join("semantic.bin");
2347        let file = fs::File::open(&data_path).ok()?;
2348        let file_len = usize::try_from(file.metadata().ok()?.len()).ok()?;
2349        if file_len < HEADER_BYTES_V1 {
2350            slog_warn!(
2351                "corrupt semantic index (too small: {} bytes), removing",
2352                file_len
2353            );
2354            if !is_worktree_bridge {
2355                let _ = fs::remove_file(&data_path);
2356            }
2357            return None;
2358        }
2359
2360        let mut reader = BufReader::new(file);
2361        let mut version_buf = [0u8; 1];
2362        reader.read_exact(&mut version_buf).ok()?;
2363        let version = version_buf[0];
2364        if version != SEMANTIC_INDEX_VERSION_V6 {
2365            slog_info!(
2366                "cached semantic index version {} is older than {}, rebuilding",
2367                version,
2368                SEMANTIC_INDEX_VERSION_V6
2369            );
2370            if !is_worktree_bridge {
2371                let _ = fs::remove_file(&data_path);
2372            }
2373            return None;
2374        }
2375        match Self::from_reader_after_version(
2376            reader,
2377            version,
2378            current_canonical_root,
2379            Some(file_len),
2380            1,
2381        ) {
2382            Ok(index) => {
2383                if index.entries.is_empty() {
2384                    slog_info!("cached semantic index is empty, will rebuild");
2385                    if !is_worktree_bridge {
2386                        let _ = fs::remove_file(&data_path);
2387                    }
2388                    return None;
2389                }
2390                if let Some(expected) = expected_fingerprint {
2391                    let matches = index
2392                        .fingerprint()
2393                        .map(|fingerprint| fingerprint.matches_expected(expected))
2394                        .unwrap_or(false);
2395                    if !matches {
2396                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2397                        if !is_worktree_bridge {
2398                            let _ = fs::remove_file(&data_path);
2399                        }
2400                        return None;
2401                    }
2402                }
2403                slog_info!(
2404                    "loaded semantic index from disk: {} entries",
2405                    index.entries.len()
2406                );
2407                Some(index)
2408            }
2409            Err(e) => {
2410                slog_warn!("corrupt semantic index, rebuilding: {}", e);
2411                if !is_worktree_bridge {
2412                    let _ = fs::remove_file(&data_path);
2413                }
2414                None
2415            }
2416        }
2417    }
2418
2419    /// Serialize the index to bytes for disk persistence
2420    pub fn to_bytes(&self) -> Vec<u8> {
2421        let mut buf = Vec::new();
2422        self.write_to_writer(&mut buf)
2423            .expect("writing semantic index to Vec cannot fail");
2424        buf
2425    }
2426
2427    fn write_to_writer<W: Write>(&self, writer: &mut W) -> io::Result<usize> {
2428        let mut bytes_written = 0usize;
2429        let fingerprint = self.fingerprint.as_ref().and_then(|fingerprint| {
2430            let encoded = fingerprint.as_string();
2431            if encoded.is_empty() {
2432                None
2433            } else {
2434                Some(encoded)
2435            }
2436        });
2437        let fp_bytes_ref = fingerprint.as_deref().map(str::as_bytes).unwrap_or(&[]);
2438        let file_mtime_count = self
2439            .file_mtimes
2440            .iter()
2441            .filter(|(path, _)| cache_relative_path(&self.project_root, path).is_some())
2442            .count();
2443        let entry_count = self
2444            .entries
2445            .iter()
2446            .filter(|entry| cache_relative_path(&self.project_root, &entry.chunk.file).is_some())
2447            .count();
2448
2449        // Header: version(1) + dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint
2450        //
2451        // V6 is the single write format. Layout extends V5:
2452        //   - fingerprint is always represented (absent ⇒ fingerprint_len=0,
2453        //     no bytes follow). Uniform format simplifies the reader.
2454        //   - paths are relative to project_root.
2455        //   - file metadata stored as secs(u64) + subsec_nanos(u32) + size(u64) + blake3(32).
2456        //     Preserves full APFS/ext4/NTFS precision and catches mtime ties.
2457        //
2458        // V1/V2 remain readable for backward compatibility (see from_bytes).
2459        // V3/V4 load as compatible formats but are rejected on disk so snippets
2460        // and file sizes are rebuilt once.
2461        let version = SEMANTIC_INDEX_VERSION_V6;
2462        write_counted(writer, &[version], &mut bytes_written)?;
2463        write_counted(
2464            writer,
2465            &(self.dimension as u32).to_le_bytes(),
2466            &mut bytes_written,
2467        )?;
2468        write_counted(
2469            writer,
2470            &(entry_count as u32).to_le_bytes(),
2471            &mut bytes_written,
2472        )?;
2473        write_counted(
2474            writer,
2475            &(fp_bytes_ref.len() as u32).to_le_bytes(),
2476            &mut bytes_written,
2477        )?;
2478        write_counted(writer, fp_bytes_ref, &mut bytes_written)?;
2479
2480        // File mtime table: count(4) + entries
2481        // V3 layout per entry: path_len(4) + path + secs(8) + subsec_nanos(4)
2482        write_counted(
2483            writer,
2484            &(file_mtime_count as u32).to_le_bytes(),
2485            &mut bytes_written,
2486        )?;
2487        for (path, mtime) in &self.file_mtimes {
2488            let Some(relative) = cache_relative_path(&self.project_root, path) else {
2489                continue;
2490            };
2491            let relative = relative.to_string_lossy();
2492            let path_bytes = relative.as_bytes();
2493            write_counted(
2494                writer,
2495                &(path_bytes.len() as u32).to_le_bytes(),
2496                &mut bytes_written,
2497            )?;
2498            write_counted(writer, path_bytes, &mut bytes_written)?;
2499            let duration = mtime
2500                .duration_since(SystemTime::UNIX_EPOCH)
2501                .unwrap_or_default();
2502            write_counted(
2503                writer,
2504                &duration.as_secs().to_le_bytes(),
2505                &mut bytes_written,
2506            )?;
2507            write_counted(
2508                writer,
2509                &duration.subsec_nanos().to_le_bytes(),
2510                &mut bytes_written,
2511            )?;
2512            let size = self.file_sizes.get(path).copied().unwrap_or_default();
2513            write_counted(writer, &size.to_le_bytes(), &mut bytes_written)?;
2514            let hash = self
2515                .file_hashes
2516                .get(path)
2517                .copied()
2518                .unwrap_or_else(cache_freshness::zero_hash);
2519            write_counted(writer, hash.as_bytes(), &mut bytes_written)?;
2520        }
2521
2522        // Entries: each is metadata + vector
2523        for entry in &self.entries {
2524            let Some(relative) = cache_relative_path(&self.project_root, &entry.chunk.file) else {
2525                continue;
2526            };
2527            let c = &entry.chunk;
2528
2529            // File path
2530            let relative = relative.to_string_lossy();
2531            let file_bytes = relative.as_bytes();
2532            write_counted(
2533                writer,
2534                &(file_bytes.len() as u32).to_le_bytes(),
2535                &mut bytes_written,
2536            )?;
2537            write_counted(writer, file_bytes, &mut bytes_written)?;
2538
2539            // Name
2540            let name_bytes = c.name.as_bytes();
2541            write_counted(
2542                writer,
2543                &(name_bytes.len() as u32).to_le_bytes(),
2544                &mut bytes_written,
2545            )?;
2546            write_counted(writer, name_bytes, &mut bytes_written)?;
2547
2548            // Kind (1 byte)
2549            write_counted(writer, &[symbol_kind_to_u8(&c.kind)], &mut bytes_written)?;
2550
2551            // Lines + exported
2552            write_counted(
2553                writer,
2554                &(c.start_line as u32).to_le_bytes(),
2555                &mut bytes_written,
2556            )?;
2557            write_counted(
2558                writer,
2559                &(c.end_line as u32).to_le_bytes(),
2560                &mut bytes_written,
2561            )?;
2562            write_counted(writer, &[c.exported as u8], &mut bytes_written)?;
2563
2564            // Snippet
2565            let snippet_bytes = c.snippet.as_bytes();
2566            write_counted(
2567                writer,
2568                &(snippet_bytes.len() as u32).to_le_bytes(),
2569                &mut bytes_written,
2570            )?;
2571            write_counted(writer, snippet_bytes, &mut bytes_written)?;
2572
2573            // Embed text
2574            let embed_bytes = c.embed_text.as_bytes();
2575            write_counted(
2576                writer,
2577                &(embed_bytes.len() as u32).to_le_bytes(),
2578                &mut bytes_written,
2579            )?;
2580            write_counted(writer, embed_bytes, &mut bytes_written)?;
2581
2582            // Vector (f32 array)
2583            for &val in &entry.vector {
2584                write_counted(writer, &val.to_le_bytes(), &mut bytes_written)?;
2585            }
2586        }
2587
2588        Ok(bytes_written)
2589    }
2590
2591    /// Deserialize the index from bytes
2592    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2593        debug_assert!(current_canonical_root.is_absolute());
2594        if data.len() < HEADER_BYTES_V1 {
2595            return Err("data too short".to_string());
2596        }
2597
2598        Self::from_reader_after_version(
2599            Cursor::new(&data[1..]),
2600            data[0],
2601            current_canonical_root,
2602            Some(data.len()),
2603            1,
2604        )
2605    }
2606
2607    fn from_reader_after_version<R: Read>(
2608        reader: R,
2609        version: u8,
2610        current_canonical_root: &Path,
2611        total_len: Option<usize>,
2612        bytes_read: usize,
2613    ) -> Result<Self, String> {
2614        debug_assert!(current_canonical_root.is_absolute());
2615        let mut reader = CountingReader::with_bytes_read(reader, bytes_read);
2616
2617        if version != SEMANTIC_INDEX_VERSION_V1
2618            && version != SEMANTIC_INDEX_VERSION_V2
2619            && version != SEMANTIC_INDEX_VERSION_V3
2620            && version != SEMANTIC_INDEX_VERSION_V4
2621            && version != SEMANTIC_INDEX_VERSION_V5
2622            && version != SEMANTIC_INDEX_VERSION_V6
2623        {
2624            return Err(format!("unsupported version: {}", version));
2625        }
2626        // V2 and newer share the same header layout (V3/V4/V5 only differ from
2627        // V2 in the per-mtime entry layout): version(1) + dimension(4) +
2628        // entry_count(4) + fingerprint_len(4) + fingerprint bytes.
2629        if (version == SEMANTIC_INDEX_VERSION_V2
2630            || version == SEMANTIC_INDEX_VERSION_V3
2631            || version == SEMANTIC_INDEX_VERSION_V4
2632            || version == SEMANTIC_INDEX_VERSION_V5
2633            || version == SEMANTIC_INDEX_VERSION_V6)
2634            && total_len.is_some_and(|len| len < HEADER_BYTES_V2)
2635        {
2636            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2637        }
2638
2639        let dimension = read_u32_stream(&mut reader)? as usize;
2640        let entry_count = read_u32_stream(&mut reader)? as usize;
2641        validate_embedding_dimension(dimension)?;
2642        if entry_count > MAX_ENTRIES {
2643            return Err(format!("too many semantic index entries: {}", entry_count));
2644        }
2645
2646        // Fingerprint handling:
2647        //   - V1: no fingerprint field at all.
2648        //   - V2: fingerprint_len + fingerprint bytes; always present (writer
2649        //     only emitted V2 when fingerprint was Some).
2650        //   - V3+: fingerprint_len always present; fingerprint_len==0 ⇒ None.
2651        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2652            || version == SEMANTIC_INDEX_VERSION_V3
2653            || version == SEMANTIC_INDEX_VERSION_V4
2654            || version == SEMANTIC_INDEX_VERSION_V5
2655            || version == SEMANTIC_INDEX_VERSION_V6;
2656        let fingerprint = if has_fingerprint_field {
2657            let fingerprint_len = read_u32_stream(&mut reader)? as usize;
2658            if total_len
2659                .is_some_and(|len| reader.bytes_read().saturating_add(fingerprint_len) > len)
2660            {
2661                return Err("unexpected end of data reading fingerprint".to_string());
2662            }
2663            if fingerprint_len == 0 {
2664                None
2665            } else {
2666                let mut raw = vec![0u8; fingerprint_len];
2667                read_exact_stream(
2668                    &mut reader,
2669                    &mut raw,
2670                    "unexpected end of data reading fingerprint",
2671                )?;
2672                let raw = String::from_utf8_lossy(&raw).to_string();
2673                Some(
2674                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2675                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2676                )
2677            }
2678        } else {
2679            None
2680        };
2681
2682        // File mtimes
2683        let mtime_count = read_u32_stream(&mut reader)? as usize;
2684        if mtime_count > MAX_ENTRIES {
2685            return Err(format!("too many semantic file mtimes: {}", mtime_count));
2686        }
2687
2688        let vector_bytes = entry_count
2689            .checked_mul(dimension)
2690            .and_then(|count| count.checked_mul(F32_BYTES))
2691            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2692        if total_len.is_some_and(|len| vector_bytes > len.saturating_sub(reader.bytes_read())) {
2693            return Err("semantic index vectors exceed available data".to_string());
2694        }
2695
2696        let mut file_mtimes = HashMap::with_capacity(mtime_count);
2697        let mut file_sizes = HashMap::with_capacity(mtime_count);
2698        let mut file_hashes = HashMap::with_capacity(mtime_count);
2699        for _ in 0..mtime_count {
2700            let path = read_string_stream(&mut reader, total_len)?;
2701            let secs = read_u64_stream(&mut reader)?;
2702            // V3+ persists subsec_nanos alongside secs so staleness checks
2703            // survive restart round-trips. V1/V2 load with 0 nanos, which
2704            // causes one rebuild on upgrade (they never matched live APFS
2705            // mtimes anyway — the bug v0.15.2 fixes). After that rebuild,
2706            // the cache is persisted as V3 and stabilises.
2707            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2708                || version == SEMANTIC_INDEX_VERSION_V4
2709                || version == SEMANTIC_INDEX_VERSION_V5
2710                || version == SEMANTIC_INDEX_VERSION_V6
2711            {
2712                read_u32_stream(&mut reader)?
2713            } else {
2714                0
2715            };
2716            let size =
2717                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2718                    read_u64_stream(&mut reader)?
2719                } else {
2720                    0
2721                };
2722            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2723                let mut hash_bytes = [0u8; 32];
2724                read_exact_stream(
2725                    &mut reader,
2726                    &mut hash_bytes,
2727                    "unexpected end of data reading content hash",
2728                )?;
2729                blake3::Hash::from_bytes(hash_bytes)
2730            } else {
2731                cache_freshness::zero_hash()
2732            };
2733            // Hardening against corrupt / maliciously crafted cache files
2734            // (v0.15.2). `Duration::new(secs, nanos)` can panic when the
2735            // nanosecond carry overflows the second counter, and
2736            // `SystemTime + Duration` can panic on carry past the platform's
2737            // upper bound. Explicit validation keeps a corrupted semantic.bin
2738            // from taking down the whole aft process.
2739            if nanos >= 1_000_000_000 {
2740                return Err(format!(
2741                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
2742                    nanos
2743                ));
2744            }
2745            let duration = std::time::Duration::new(secs, nanos);
2746            let mtime = SystemTime::UNIX_EPOCH
2747                .checked_add(duration)
2748                .ok_or_else(|| {
2749                    format!(
2750                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2751                        secs, nanos
2752                    )
2753                })?;
2754            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2755                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2756                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2757            } else {
2758                PathBuf::from(path)
2759            };
2760            file_mtimes.insert(path.clone(), mtime);
2761            file_sizes.insert(path.clone(), size);
2762            file_hashes.insert(path, content_hash);
2763        }
2764
2765        // Entries
2766        let mut entries = Vec::with_capacity(entry_count);
2767        for _ in 0..entry_count {
2768            let raw_file = PathBuf::from(read_string_stream(&mut reader, total_len)?);
2769            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2770                cached_path_under_root(current_canonical_root, &raw_file)
2771                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2772            } else {
2773                raw_file
2774            };
2775            let name = read_string_stream(&mut reader, total_len)?;
2776
2777            let kind = u8_to_symbol_kind(read_u8_stream(&mut reader, "unexpected end of data")?);
2778
2779            let start_line = read_u32_stream(&mut reader)?;
2780            let end_line = read_u32_stream(&mut reader)?;
2781
2782            let exported = read_u8_stream(&mut reader, "unexpected end of data")? != 0;
2783
2784            let snippet = read_string_stream(&mut reader, total_len)?;
2785            let embed_text = read_string_stream(&mut reader, total_len)?;
2786
2787            // Vector
2788            let vec_bytes = dimension
2789                .checked_mul(F32_BYTES)
2790                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2791            if total_len.is_some_and(|len| reader.bytes_read().saturating_add(vec_bytes) > len) {
2792                return Err("unexpected end of data reading vector".to_string());
2793            }
2794            let mut vector = Vec::with_capacity(dimension);
2795            for _ in 0..dimension {
2796                let mut bytes = [0u8; F32_BYTES];
2797                read_exact_stream(
2798                    &mut reader,
2799                    &mut bytes,
2800                    "unexpected end of data reading vector",
2801                )?;
2802                vector.push(f32::from_le_bytes(bytes));
2803            }
2804
2805            entries.push(EmbeddingEntry {
2806                chunk: SemanticChunk {
2807                    file,
2808                    name,
2809                    kind,
2810                    start_line,
2811                    end_line,
2812                    exported,
2813                    embed_text,
2814                    snippet,
2815                },
2816                vector,
2817            });
2818        }
2819
2820        if entries.len() != entry_count {
2821            return Err(format!(
2822                "semantic cache entry count drift: header={} decoded={}",
2823                entry_count,
2824                entries.len()
2825            ));
2826        }
2827        for entry in &entries {
2828            if !file_mtimes.contains_key(&entry.chunk.file) {
2829                return Err(format!(
2830                    "semantic cache metadata missing for entry file {}",
2831                    entry.chunk.file.display()
2832                ));
2833            }
2834        }
2835
2836        Ok(Self {
2837            entries,
2838            file_mtimes,
2839            file_sizes,
2840            file_hashes,
2841            dimension,
2842            fingerprint,
2843            project_root: current_canonical_root.to_path_buf(),
2844            deferred_files: HashSet::new(),
2845        })
2846    }
2847}
2848
/// Write all of `bytes` to `writer` and, on success, add their length to the
/// running `bytes_written` total (saturating at `usize::MAX`).
///
/// On a write error the total is left untouched and the error is returned.
fn write_counted<W: Write>(
    writer: &mut W,
    bytes: &[u8],
    bytes_written: &mut usize,
) -> io::Result<()> {
    writer.write_all(bytes).map(|()| {
        *bytes_written = bytes_written.saturating_add(bytes.len());
    })
}
2858
/// `Read` adapter that keeps a running total of the bytes it has yielded.
struct CountingReader<R> {
    inner: R,
    bytes_read: usize,
}

impl<R> CountingReader<R> {
    /// Wraps `inner`, starting the running total at `bytes_read`.
    fn with_bytes_read(inner: R, bytes_read: usize) -> Self {
        CountingReader { inner, bytes_read }
    }

    /// Current total: the initial offset plus every byte read through this
    /// adapter (saturating).
    fn bytes_read(&self) -> usize {
        self.bytes_read
    }
}

impl<R: Read> Read for CountingReader<R> {
    /// Forwards to the wrapped reader and adds the reported byte count to the
    /// total; errors pass through without touching the total.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        self.inner.read(buf).map(|count| {
            self.bytes_read = self.bytes_read.saturating_add(count);
            count
        })
    }
}
2881
2882fn read_exact_stream<R: Read>(
2883    reader: &mut CountingReader<R>,
2884    buf: &mut [u8],
2885    eof_message: &'static str,
2886) -> Result<(), String> {
2887    reader.read_exact(buf).map_err(|error| {
2888        if error.kind() == io::ErrorKind::UnexpectedEof {
2889            eof_message.to_string()
2890        } else {
2891            format!("{eof_message}: {error}")
2892        }
2893    })
2894}
2895
2896fn read_u8_stream<R: Read>(
2897    reader: &mut CountingReader<R>,
2898    eof_message: &'static str,
2899) -> Result<u8, String> {
2900    let mut bytes = [0u8; 1];
2901    read_exact_stream(reader, &mut bytes, eof_message)?;
2902    Ok(bytes[0])
2903}
2904
2905fn read_u32_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u32, String> {
2906    let mut bytes = [0u8; 4];
2907    read_exact_stream(reader, &mut bytes, "unexpected end of data reading u32")?;
2908    Ok(u32::from_le_bytes(bytes))
2909}
2910
2911fn read_u64_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u64, String> {
2912    let mut bytes = [0u8; 8];
2913    read_exact_stream(reader, &mut bytes, "unexpected end of data reading u64")?;
2914    Ok(u64::from_le_bytes(bytes))
2915}
2916
2917fn read_string_stream<R: Read>(
2918    reader: &mut CountingReader<R>,
2919    total_len: Option<usize>,
2920) -> Result<String, String> {
2921    let len = read_u32_stream(reader)? as usize;
2922    if total_len.is_some_and(|total_len| reader.bytes_read().saturating_add(len) > total_len) {
2923        return Err("unexpected end of data reading string".to_string());
2924    }
2925    let mut bytes = vec![0u8; len];
2926    read_exact_stream(reader, &mut bytes, "unexpected end of data reading string")?;
2927    Ok(String::from_utf8_lossy(&bytes).to_string())
2928}
2929
/// Source text split into lines, plus the byte offset at which each line
/// starts in the original string.
struct SourceLineCache<'a> {
    lines: Vec<&'a str>,
    line_starts: Vec<usize>,
}

impl<'a> SourceLineCache<'a> {
    /// Splits `source` with `str::lines` and records every line's starting
    /// byte offset, stepping over the `\n` or `\r\n` terminator that follows
    /// each line.
    fn new(source: &'a str) -> Self {
        let lines: Vec<&'a str> = source.lines().collect();
        let raw = source.as_bytes();
        let mut cursor = 0usize;
        let line_starts: Vec<usize> = lines
            .iter()
            .map(|line| {
                let start = cursor;
                cursor += line.len();
                // Advance past whichever terminator (if any) follows the line.
                cursor += match (raw.get(cursor), raw.get(cursor + 1)) {
                    (Some(b'\r'), Some(b'\n')) => 2,
                    (Some(b'\n'), _) => 1,
                    _ => 0,
                };
                start
            })
            .collect();
        Self { lines, line_starts }
    }

    /// Number of lines in the cache.
    fn len(&self) -> usize {
        debug_assert_eq!(self.lines.len(), self.line_starts.len());
        self.line_starts.len()
    }
}
2958
2959/// Build enriched embedding text from a symbol with cAST-style context
2960fn build_embed_text_with_lines(
2961    symbol: &Symbol,
2962    line_cache: &SourceLineCache<'_>,
2963    file: &Path,
2964    project_root: &Path,
2965) -> String {
2966    let relative = file
2967        .strip_prefix(project_root)
2968        .unwrap_or(file)
2969        .to_string_lossy();
2970
2971    let kind_label = match &symbol.kind {
2972        SymbolKind::Function => "function",
2973        SymbolKind::Class => "class",
2974        SymbolKind::Method => "method",
2975        SymbolKind::Struct => "struct",
2976        SymbolKind::Interface => "interface",
2977        SymbolKind::Enum => "enum",
2978        SymbolKind::TypeAlias => "type",
2979        SymbolKind::Variable => "variable",
2980        SymbolKind::Heading => "heading",
2981        SymbolKind::FileSummary => "file-summary",
2982    };
2983
2984    // Build: "file:relative/path kind:function name:validateAuth signature:fn validateAuth(token: &str) -> bool"
2985    let name = &symbol.name;
2986    let mut text = format!(
2987        "name:{name} file:{} kind:{} name:{name}",
2988        relative, kind_label
2989    );
2990
2991    if let Some(sig) = &symbol.signature {
2992        // Cap the signature: structured parsers (e.g. YAML/Kubernetes) pack
2993        // entire inline scripts (CronJob/Job `command:` bodies, multi-KB) into
2994        // the signature. Appending it unbounded produces a single embed_text
2995        // that overflows the embedding backend's physical batch (e.g. a
2996        // llama.cpp server's 512-token cap), aborting the whole index build
2997        // and silently degrading every search to lexical. 400 chars keeps the
2998        // identifying head of the signature without blowing the budget.
2999        text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
3000    }
3001
3002    // Add body snippet (first ~300 chars of symbol body)
3003    let start = (symbol.range.start_line as usize).min(line_cache.len());
3004    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
3005    let end = (symbol.range.end_line as usize + 1).min(line_cache.len());
3006    if start < end {
3007        let body: String = line_cache.lines[start..end]
3008            .iter()
3009            .take(15) // max 15 lines
3010            .copied()
3011            .collect::<Vec<&str>>()
3012            .join("\n");
3013        let snippet = if body.len() > 300 {
3014            format!("{}...", &body[..body.floor_char_boundary(300)])
3015        } else {
3016            body
3017        };
3018        text.push_str(&format!(" body:{}", snippet));
3019    }
3020
3021    // Final defense-in-depth clamp: no single embed_text may exceed the
3022    // backend's per-input budget regardless of which field grew. Most
3023    // backends cap a physical batch around 512 tokens; ~1600 chars stays
3024    // comfortably under that for typical English/code (≈4 chars/token).
3025    truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
3026}
3027
/// Test-only convenience wrapper: builds a line cache from `source` and
/// delegates to `build_embed_text_with_lines`.
#[cfg(test)]
fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
    build_embed_text_with_lines(symbol, &SourceLineCache::new(source), file, project_root)
}
3033
/// Maximum number of characters allowed in one chunk's `embed_text`.
///
/// Sized to stay under the ~512-token physical batch limit typical of
/// embedding backends, so a single oversized symbol cannot abort the whole
/// index build.
const MAX_EMBED_TEXT_CHARS: usize = 1_600;
3038
/// Returns the first `max_chars` characters (Unicode scalar values) of
/// `value` as an owned string; shorter inputs are returned whole.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte index of the character at position `max_chars`, if there is
    // one, is exactly where the kept prefix ends.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_string(),
        None => value.to_string(),
    }
}
3042
3043fn first_leading_doc_comment(line_cache: &SourceLineCache<'_>) -> String {
3044    let Some((start, first)) = line_cache
3045        .lines
3046        .iter()
3047        .enumerate()
3048        .find(|(_, line)| !line.trim().is_empty())
3049    else {
3050        return String::new();
3051    };
3052
3053    let trimmed = first.trim_start();
3054    if trimmed.starts_with("/**") {
3055        let mut comment = Vec::new();
3056        for line in line_cache.lines.iter().skip(start) {
3057            comment.push(*line);
3058            if line.contains("*/") {
3059                break;
3060            }
3061        }
3062        return truncate_chars(&comment.join("\n"), 200);
3063    }
3064
3065    if trimmed.starts_with("///") || trimmed.starts_with("//!") {
3066        let comment = line_cache
3067            .lines
3068            .iter()
3069            .skip(start)
3070            .take_while(|line| {
3071                let trimmed = line.trim_start();
3072                trimmed.starts_with("///") || trimmed.starts_with("//!")
3073            })
3074            .copied()
3075            .collect::<Vec<_>>()
3076            .join("\n");
3077        return truncate_chars(&comment, 200);
3078    }
3079
3080    String::new()
3081}
3082
3083pub fn build_file_summary_chunk(
3084    file: &Path,
3085    project_root: &Path,
3086    source: &str,
3087    top_exports: &[&str],
3088    top_export_signatures: &[Option<&str>],
3089) -> SemanticChunk {
3090    let line_cache = SourceLineCache::new(source);
3091    build_file_summary_chunk_with_lines(
3092        file,
3093        project_root,
3094        &line_cache,
3095        top_exports,
3096        top_export_signatures,
3097    )
3098}
3099
3100fn build_file_summary_chunk_with_lines(
3101    file: &Path,
3102    project_root: &Path,
3103    line_cache: &SourceLineCache<'_>,
3104    top_exports: &[&str],
3105    top_export_signatures: &[Option<&str>],
3106) -> SemanticChunk {
3107    let relative = file.strip_prefix(project_root).unwrap_or(file);
3108    let rel_path = relative.to_string_lossy();
3109    let parent_dir = relative
3110        .parent()
3111        .map(|parent| parent.to_string_lossy().to_string())
3112        .unwrap_or_default();
3113    let name = file
3114        .file_stem()
3115        .map(|stem| stem.to_string_lossy().to_string())
3116        .unwrap_or_default();
3117    let doc = first_leading_doc_comment(line_cache);
3118    let exports = top_exports
3119        .iter()
3120        .take(5)
3121        .copied()
3122        .collect::<Vec<_>>()
3123        .join(",");
3124    let snippet = if doc.is_empty() {
3125        top_export_signatures
3126            .first()
3127            .and_then(|signature| signature.as_deref())
3128            .map(|signature| truncate_chars(signature, 200))
3129            .unwrap_or_default()
3130    } else {
3131        doc.clone()
3132    };
3133
3134    SemanticChunk {
3135        file: file.to_path_buf(),
3136        name,
3137        kind: SymbolKind::FileSummary,
3138        start_line: 0,
3139        end_line: 0,
3140        exported: false,
3141        embed_text: truncate_chars(
3142            &format!(
3143                "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
3144                file.file_stem()
3145                    .map(|stem| stem.to_string_lossy().to_string())
3146                    .unwrap_or_default()
3147            ),
3148            MAX_EMBED_TEXT_CHARS,
3149        ),
3150        snippet,
3151    }
3152}
3153
3154fn parser_for(
3155    parsers: &mut HashMap<crate::parser::LangId, Parser>,
3156    lang: crate::parser::LangId,
3157) -> Result<&mut Parser, String> {
3158    use std::collections::hash_map::Entry;
3159
3160    match parsers.entry(lang) {
3161        Entry::Occupied(entry) => Ok(entry.into_mut()),
3162        Entry::Vacant(entry) => {
3163            let grammar = grammar_for(lang);
3164            let mut parser = Parser::new();
3165            parser
3166                .set_language(&grammar)
3167                .map_err(|error| error.to_string())?;
3168            Ok(entry.insert(parser))
3169        }
3170    }
3171}
3172
/// Reports whether `path` has an extension that the semantic index accepts.
///
/// The comparison is exact (case-sensitive). Paths without an extension, or whose
/// extension is not valid UTF-8, are rejected.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
        "pas", "pp", "dpr", "dpk", "lpr",
    ];

    match path.extension().and_then(|extension| extension.to_str()) {
        Some(extension) => INDEXED_EXTENSIONS.contains(&extension),
        None => false,
    }
}
3210
/// Canonicalizes `path`, tolerating a final component that no longer exists.
///
/// When the path itself cannot be canonicalized, the parent directory is
/// canonicalized instead and the original file name is appended. If that also
/// fails, or the path has no parent or no file name, the input is returned as-is.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    match fs::canonicalize(path) {
        Ok(canonical) => canonical,
        Err(_) => match (path.parent(), path.file_name()) {
            (Some(parent), Some(file_name)) => match fs::canonicalize(parent) {
                Ok(canonical_parent) => canonical_parent.join(file_name),
                Err(_) => path.to_path_buf(),
            },
            _ => path.to_path_buf(),
        },
    }
}
3227
/// Upper bound on the size of a file that is read and parsed for semantic chunking.
///
/// Reading and tree-sitter parsing cost memory proportional to the file size
/// (tree-sitter can need several times the source bytes), and `par_iter`
/// collection parses many files at once, so without a cap a few multi-MB
/// generated, vendored or minified files are an OOM vector. Such files add almost
/// nothing useful to the index anyway, because each chunk's `embed_text` is clamped
/// to `MAX_EMBED_TEXT_CHARS`. Oversized files are therefore tracked with zero
/// chunks instead of being read, which lets freshness skip them on later
/// refreshes. 4 MiB keeps essentially all hand-written source while capping the
/// pathological tail.
const MAX_SEMANTIC_FILE_BYTES: u64 = 4 << 20;
3238
3239fn collect_semantic_file(
3240    project_root: &Path,
3241    file: &Path,
3242    parsers: &mut HashMap<crate::parser::LangId, Parser>,
3243) -> Result<(IndexedFileMetadata, Vec<SemanticChunk>), String> {
3244    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
3245    if !metadata.is_file() {
3246        return Err("not a regular file".to_string());
3247    }
3248    let mtime = metadata.modified().map_err(|error| error.to_string())?;
3249    let size = metadata.len();
3250
3251    if !is_semantic_indexed_extension(file) {
3252        return Err("unsupported file extension".to_string());
3253    }
3254    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
3255
3256    let mut indexed_metadata = IndexedFileMetadata {
3257        mtime,
3258        size,
3259        content_hash: cache_freshness::zero_hash(),
3260    };
3261
3262    // OOM backstop: skip oversized files before the read + parse (tracked with
3263    // zero chunks by the caller, so freshness won't re-read them every refresh).
3264    if size > MAX_SEMANTIC_FILE_BYTES {
3265        return Ok((indexed_metadata, Vec::new()));
3266    }
3267
3268    let source = fs::read_to_string(file).map_err(|error| error.to_string())?;
3269    indexed_metadata.content_hash = if size <= cache_freshness::CONTENT_HASH_SIZE_CAP {
3270        cache_freshness::hash_bytes(source.as_bytes())
3271    } else {
3272        cache_freshness::zero_hash()
3273    };
3274
3275    let chunks = collect_file_chunks_from_source(project_root, file, lang, parsers, &source)?;
3276    Ok((indexed_metadata, chunks))
3277}
3278
/// Test-only helper: reads `file` and returns its semantic chunks.
///
/// Unsupported extensions/languages are an error; files above
/// `MAX_SEMANTIC_FILE_BYTES` yield an empty chunk list without being read.
#[cfg(test)]
fn collect_file_chunks(
    project_root: &Path,
    file: &Path,
    parsers: &mut HashMap<crate::parser::LangId, Parser>,
) -> Result<Vec<SemanticChunk>, String> {
    let lang = is_semantic_indexed_extension(file)
        .then(|| detect_language(file))
        .flatten()
        .ok_or_else(|| "unsupported file extension".to_string())?;

    // OOM backstop: oversized files are skipped before the read + parse. A failed
    // metadata lookup does not skip; the read below reports the problem instead.
    let oversized = matches!(
        fs::metadata(file),
        Ok(stat) if stat.len() > MAX_SEMANTIC_FILE_BYTES
    );
    if oversized {
        return Ok(Vec::new());
    }

    let source = fs::read_to_string(file).map_err(|error| error.to_string())?;
    collect_file_chunks_from_source(project_root, file, lang, parsers, &source)
}
3297
3298fn collect_file_chunks_from_source(
3299    project_root: &Path,
3300    file: &Path,
3301    lang: crate::parser::LangId,
3302    parsers: &mut HashMap<crate::parser::LangId, Parser>,
3303    source: &str,
3304) -> Result<Vec<SemanticChunk>, String> {
3305    let tree = parser_for(parsers, lang)?
3306        .parse(source, None)
3307        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
3308    let symbols =
3309        extract_symbols_from_tree(source, &tree, lang).map_err(|error| error.to_string())?;
3310
3311    Ok(symbols_to_chunks(file, &symbols, source, project_root))
3312}
3313
3314/// Build a display snippet from a symbol's source
3315fn build_snippet_with_lines(symbol: &Symbol, line_cache: &SourceLineCache<'_>) -> String {
3316    let start = (symbol.range.start_line as usize).min(line_cache.len());
3317    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
3318    let end = (symbol.range.end_line as usize + 1).min(line_cache.len());
3319    if start < end {
3320        let snippet_lines: Vec<&str> = line_cache.lines[start..end]
3321            .iter()
3322            .take(5)
3323            .copied()
3324            .collect();
3325        let mut snippet = snippet_lines.join("\n");
3326        if end - start > 5 {
3327            snippet.push_str("\n  ...");
3328        }
3329        if snippet.len() > 300 {
3330            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
3331        }
3332        snippet
3333    } else {
3334        String::new()
3335    }
3336}
3337
/// Test-only wrapper that builds a snippet directly from raw source text.
#[cfg(test)]
fn build_snippet(symbol: &Symbol, source: &str) -> String {
    let lines = SourceLineCache::new(source);
    let snippet = build_snippet_with_lines(symbol, &lines);
    snippet
}
3343
3344/// Convert symbols to semantic chunks with enriched context
3345fn symbols_to_chunks(
3346    file: &Path,
3347    symbols: &[Symbol],
3348    source: &str,
3349    project_root: &Path,
3350) -> Vec<SemanticChunk> {
3351    let line_cache = SourceLineCache::new(source);
3352    let mut chunks = Vec::new();
3353    let top_exports_with_signatures = symbols
3354        .iter()
3355        .filter(|symbol| {
3356            symbol.exported
3357                && symbol.parent.is_none()
3358                && !matches!(symbol.kind, SymbolKind::Heading)
3359        })
3360        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
3361        .collect::<Vec<_>>();
3362
3363    let has_only_headings = !symbols.is_empty()
3364        && symbols
3365            .iter()
3366            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
3367    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
3368        let top_exports = top_exports_with_signatures
3369            .iter()
3370            .map(|(name, _)| *name)
3371            .collect::<Vec<_>>();
3372        let top_export_signatures = top_exports_with_signatures
3373            .iter()
3374            .map(|(_, signature)| *signature)
3375            .collect::<Vec<_>>();
3376        chunks.push(build_file_summary_chunk_with_lines(
3377            file,
3378            project_root,
3379            &line_cache,
3380            &top_exports,
3381            &top_export_signatures,
3382        ));
3383    }
3384
3385    for symbol in symbols {
3386        // Skip Markdown / HTML heading chunks: empirically they dominate result
3387        // lists even for code-shaped queries because heading prose embeds well.
3388        // Agents querying for code lose the actual matches under doc noise.
3389        // README/docs queries are still served by grep on the same files.
3390        if matches!(symbol.kind, SymbolKind::Heading) {
3391            continue;
3392        }
3393
3394        // Skip very small symbols (single-line variables, etc.)
3395        let line_count = symbol
3396            .range
3397            .end_line
3398            .saturating_sub(symbol.range.start_line)
3399            + 1;
3400        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
3401            continue;
3402        }
3403
3404        let embed_text = build_embed_text_with_lines(symbol, &line_cache, file, project_root);
3405        let snippet = build_snippet_with_lines(symbol, &line_cache);
3406
3407        chunks.push(SemanticChunk {
3408            file: file.to_path_buf(),
3409            name: symbol.name.clone(),
3410            kind: symbol.kind.clone(),
3411            start_line: symbol.range.start_line,
3412            end_line: symbol.range.end_line,
3413            exported: symbol.exported,
3414            embed_text,
3415            snippet,
3416        });
3417
3418        // Note: Nested symbols are handled separately by the outline system
3419        // Each symbol is indexed individually
3420    }
3421
3422    chunks
3423}
3424
/// Orders `(score, index)` pairs for ranking: higher score first, lower index first
/// on ties.
///
/// NaN scores sort after every real score (and tie with each other, falling back to
/// the index). Treating NaN as "equal to everything" would make the comparison
/// non-transitive, which `sort_by` and friends are not required to tolerate. For
/// non-NaN scores the result is the plain descending numeric comparison, so `0.0`
/// and `-0.0` still tie.
fn semantic_score_order(a: &(f32, usize), b: &(f32, usize)) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    let by_score = match (a.0.is_nan(), b.0.is_nan()) {
        (true, true) => Ordering::Equal,
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
        // Neither side is NaN, so `partial_cmp` always yields `Some`.
        (false, false) => b.0.partial_cmp(&a.0).unwrap_or(Ordering::Equal),
    };
    by_score.then_with(|| a.1.cmp(&b.1))
}
3430
/// Cosine similarity of two equally long vectors.
///
/// Returns `0.0` when the lengths differ or when either vector has zero magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    let (mut dot, mut norm_a, mut norm_b) = (0.0f32, 0.0f32, 0.0f32);
    for (&x, &y) in a.iter().zip(b) {
        dot += x * y;
        norm_a += x * x;
        norm_b += y * y;
    }

    match norm_a.sqrt() * norm_b.sqrt() {
        denom if denom == 0.0 => 0.0,
        denom => dot / denom,
    }
}
3454
3455// Serialization helpers
3456fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
3457    match kind {
3458        SymbolKind::Function => 0,
3459        SymbolKind::Class => 1,
3460        SymbolKind::Method => 2,
3461        SymbolKind::Struct => 3,
3462        SymbolKind::Interface => 4,
3463        SymbolKind::Enum => 5,
3464        SymbolKind::TypeAlias => 6,
3465        SymbolKind::Variable => 7,
3466        SymbolKind::Heading => 8,
3467        SymbolKind::FileSummary => 9,
3468    }
3469}
3470
3471fn u8_to_symbol_kind(v: u8) -> SymbolKind {
3472    match v {
3473        0 => SymbolKind::Function,
3474        1 => SymbolKind::Class,
3475        2 => SymbolKind::Method,
3476        3 => SymbolKind::Struct,
3477        4 => SymbolKind::Interface,
3478        5 => SymbolKind::Enum,
3479        6 => SymbolKind::TypeAlias,
3480        7 => SymbolKind::Variable,
3481        8 => SymbolKind::Heading,
3482        9 => SymbolKind::FileSummary,
3483        _ => SymbolKind::Heading,
3484    }
3485}
3486
3487#[cfg(test)]
3488mod tests {
3489    use super::*;
3490    use crate::config::{SemanticBackend, SemanticBackendConfig};
3491    use crate::parser::FileParser;
3492    use std::io::{Read, Write};
3493    use std::net::TcpListener;
3494    use std::thread;
3495
3496    #[test]
3497    fn semantic_index_includes_php_inc_and_scss_extensions() {
3498        for file in ["partial.inc", "index.php", "styles.scss"] {
3499            assert!(
3500                is_semantic_indexed_extension(Path::new(file)),
3501                "{file} should be semantic-index eligible"
3502            );
3503        }
3504    }
3505
3506    #[test]
3507    fn transient_marker_round_trips_and_classifies() {
3508        // A marked transient error is recognized and the marker is stripped for
3509        // display, leaving a clean message.
3510        let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
3511        assert!(embedding_failure_is_transient(&marked));
3512        let clean = strip_transient_embedding_marker(&marked);
3513        assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
3514        assert!(clean.starts_with("openai compatible request failed:"));
3515
3516        // Permanent errors (HTTP 4xx, dimension mismatch) carry no marker and
3517        // are not classified transient — they must fail fast.
3518        for permanent in [
3519            "openai compatible request failed (HTTP 401): Unauthorized",
3520            "embedding dimension mismatch: index has 384, model returned 768",
3521            "too many files (>20000) for semantic indexing (max 20000)",
3522        ] {
3523            assert!(
3524                !embedding_failure_is_transient(permanent),
3525                "{permanent:?} must not be transient"
3526            );
3527            // Stripping a marker-free string is a no-op.
3528            assert_eq!(strip_transient_embedding_marker(permanent), permanent);
3529        }
3530    }
3531
3532    #[test]
3533    fn send_error_transience_separates_connect_timeout_from_4xx() {
3534        // 5xx / 429 are transient; other client errors are not.
3535        assert!(is_retryable_embedding_status(
3536            reqwest::StatusCode::INTERNAL_SERVER_ERROR
3537        ));
3538        assert!(is_retryable_embedding_status(
3539            reqwest::StatusCode::TOO_MANY_REQUESTS
3540        ));
3541        assert!(!is_retryable_embedding_status(
3542            reqwest::StatusCode::UNAUTHORIZED
3543        ));
3544        assert!(!is_retryable_embedding_status(
3545            reqwest::StatusCode::BAD_REQUEST
3546        ));
3547    }
3548
3549    #[test]
3550    fn local_backend_model_loading_body_is_transient() {
3551        // LM Studio / Ollama return a 4xx with a loading/unloaded message while
3552        // the model swaps; these must classify transient so the build self-heals.
3553        for body in [
3554            r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
3555            r#"{"error":"model is loading, please wait"}"#,
3556            r#"{"error":"Model not loaded"}"#,
3557            "Loading model into memory",
3558        ] {
3559            assert!(
3560                embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3561                "{body:?} should be body-transient"
3562            );
3563        }
3564
3565        // A genuine 4xx misconfiguration body must NOT be treated as transient,
3566        // even when it happens to contain generic words from the old broad
3567        // substring matcher.
3568        for body in [
3569            r#"{"error":"invalid api key"}"#,
3570            r#"{"error":"model 'foo' not found"}"#,
3571            "Bad Request: unknown field",
3572            "Bad Request: invalid loading model option",
3573            r#"{"error":"unauthorized while model is being loaded by another account"}"#,
3574        ] {
3575            assert!(
3576                !embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3577                "{body:?} must not be body-transient"
3578            );
3579        }
3580
3581        assert!(
3582            !embedding_response_body_is_transient(
3583                reqwest::StatusCode::UNAUTHORIZED,
3584                r#"{"error":"model is loading, please wait"}"#
3585            ),
3586            "permanent auth failures must not become transient because of body text"
3587        );
3588    }
3589
3590    fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
3591    where
3592        F: Fn(String, String, String) -> String + Send + 'static,
3593    {
3594        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3595        let addr = listener.local_addr().expect("local addr");
3596        let handle = thread::spawn(move || {
3597            let (mut stream, _) = listener.accept().expect("accept request");
3598            let mut buf = Vec::new();
3599            let mut chunk = [0u8; 4096];
3600            let mut header_end = None;
3601            let mut content_length = 0usize;
3602            loop {
3603                let n = stream.read(&mut chunk).expect("read request");
3604                if n == 0 {
3605                    break;
3606                }
3607                buf.extend_from_slice(&chunk[..n]);
3608                if header_end.is_none() {
3609                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3610                        header_end = Some(pos + 4);
3611                        let headers = String::from_utf8_lossy(&buf[..pos + 4]);
3612                        for line in headers.lines() {
3613                            if let Some(value) = line.strip_prefix("Content-Length:") {
3614                                content_length = value.trim().parse::<usize>().unwrap_or(0);
3615                            }
3616                        }
3617                    }
3618                }
3619                if let Some(end) = header_end {
3620                    if buf.len() >= end + content_length {
3621                        break;
3622                    }
3623                }
3624            }
3625
3626            let end = header_end.expect("header terminator");
3627            let request = String::from_utf8_lossy(&buf[..end]).to_string();
3628            let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3629            let mut lines = request.lines();
3630            let request_line = lines.next().expect("request line").to_string();
3631            let path = request_line
3632                .split_whitespace()
3633                .nth(1)
3634                .expect("request path")
3635                .to_string();
3636            let response_body = handler(request_line, path, body);
3637            let response = format!(
3638                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3639                response_body.len(),
3640                response_body
3641            );
3642            stream
3643                .write_all(response.as_bytes())
3644                .expect("write response");
3645        });
3646
3647        (format!("http://{}", addr), handle)
3648    }
3649
3650    fn start_truncated_body_server(attempts: usize) -> (String, thread::JoinHandle<()>) {
3651        let listener = TcpListener::bind("127.0.0.1:0").expect("bind truncated test server");
3652        listener
3653            .set_nonblocking(true)
3654            .expect("nonblocking listener");
3655        let addr = listener.local_addr().expect("local addr");
3656        let handle = thread::spawn(move || {
3657            let deadline = std::time::Instant::now() + Duration::from_secs(2);
3658            let mut accepted = 0usize;
3659            while accepted < attempts && std::time::Instant::now() < deadline {
3660                match listener.accept() {
3661                    Ok((mut stream, _)) => {
3662                        accepted += 1;
3663                        let mut buf = [0u8; 4096];
3664                        // The client (under test) uses a 250ms timeout and drops
3665                        // the connection when the truncated body never completes.
3666                        // On Windows that disconnect surfaces as a hard socket
3667                        // error (WSAECONNRESET) on these read/write calls, where
3668                        // Unix returns a clean EOF. Tolerate both: the mock does
3669                        // not need the request bytes, and a write to an
3670                        // already-hung-up client is expected.
3671                        let _ = stream.read(&mut buf);
3672                        let response = "HTTP/1.1 200 OK
3673Content-Type: application/json
3674Content-Length: 128
3675Connection: close
3676
3677{";
3678                        let _ = stream.write_all(response.as_bytes());
3679                    }
3680                    Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
3681                        thread::sleep(Duration::from_millis(10));
3682                    }
3683                    Err(error) => panic!("accept request: {error}"),
3684                }
3685            }
3686        });
3687
3688        (format!("http://{}", addr), handle)
3689    }
3690
3691    #[test]
3692    fn response_body_read_failures_are_marked_transient() {
3693        let (url, handle) = start_truncated_body_server(EMBEDDING_REQUEST_MAX_ATTEMPTS);
3694        let client = Client::builder()
3695            .timeout(Duration::from_millis(250))
3696            .build()
3697            .expect("client");
3698
3699        let error = send_embedding_request(|| client.post(&url).body("{}"), "test backend")
3700            .expect_err("truncated body should fail");
3701
3702        handle.join().unwrap();
3703        assert!(
3704            embedding_failure_is_transient(&error),
3705            "body read failures should be transient-marked: {error}"
3706        );
3707        assert!(error.contains("response read failed"));
3708    }
3709
3710    fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3711        Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
3712    }
3713
3714    fn write_rust_file(path: &Path, function_name: &str) {
3715        fs::write(
3716            path,
3717            format!("pub fn {function_name}() -> bool {{\n    true\n}}\n"),
3718        )
3719        .unwrap();
3720    }
3721
3722    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3723        let mut embed = test_vector_for_texts;
3724        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3725    }
3726
3727    fn test_project_root() -> PathBuf {
3728        std::env::current_dir().unwrap()
3729    }
3730
3731    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3732        index.file_mtimes.insert(file.to_path_buf(), mtime);
3733        index.file_sizes.insert(file.to_path_buf(), size);
3734        index
3735            .file_hashes
3736            .insert(file.to_path_buf(), cache_freshness::zero_hash());
3737    }
3738
3739    fn legacy_semantic_index_bytes(index: &SemanticIndex) -> Vec<u8> {
3740        let mut buf = Vec::new();
3741        let fingerprint_bytes = index.fingerprint.as_ref().and_then(|fingerprint| {
3742            let encoded = fingerprint.as_string();
3743            if encoded.is_empty() {
3744                None
3745            } else {
3746                Some(encoded.into_bytes())
3747            }
3748        });
3749        let file_mtimes: Vec<_> = index
3750            .file_mtimes
3751            .iter()
3752            .filter_map(|(path, mtime)| {
3753                cache_relative_path(&index.project_root, path)
3754                    .map(|relative| (relative, path, mtime))
3755            })
3756            .collect();
3757        let entries: Vec<_> = index
3758            .entries
3759            .iter()
3760            .filter_map(|entry| {
3761                cache_relative_path(&index.project_root, &entry.chunk.file)
3762                    .map(|relative| (relative, entry))
3763            })
3764            .collect();
3765
3766        buf.push(SEMANTIC_INDEX_VERSION_V6);
3767        buf.extend_from_slice(&(index.dimension as u32).to_le_bytes());
3768        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
3769        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
3770        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
3771        buf.extend_from_slice(fp_bytes_ref);
3772
3773        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
3774        for (relative, path, mtime) in &file_mtimes {
3775            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
3776            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
3777            buf.extend_from_slice(&path_bytes);
3778            let duration = mtime
3779                .duration_since(SystemTime::UNIX_EPOCH)
3780                .unwrap_or_default();
3781            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
3782            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
3783            let size = index.file_sizes.get(*path).copied().unwrap_or_default();
3784            buf.extend_from_slice(&size.to_le_bytes());
3785            let hash = index
3786                .file_hashes
3787                .get(*path)
3788                .copied()
3789                .unwrap_or_else(cache_freshness::zero_hash);
3790            buf.extend_from_slice(hash.as_bytes());
3791        }
3792
3793        for (relative, entry) in &entries {
3794            let c = &entry.chunk;
3795            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
3796            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
3797            buf.extend_from_slice(&file_bytes);
3798
3799            let name_bytes = c.name.as_bytes();
3800            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
3801            buf.extend_from_slice(name_bytes);
3802
3803            buf.push(symbol_kind_to_u8(&c.kind));
3804            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
3805            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
3806            buf.push(c.exported as u8);
3807
3808            let snippet_bytes = c.snippet.as_bytes();
3809            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
3810            buf.extend_from_slice(snippet_bytes);
3811
3812            let embed_bytes = c.embed_text.as_bytes();
3813            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
3814            buf.extend_from_slice(embed_bytes);
3815
3816            for &val in &entry.vector {
3817                buf.extend_from_slice(&val.to_le_bytes());
3818            }
3819        }
3820
3821        buf
3822    }
3823
3824    #[derive(Default)]
3825    struct RecordingEmbedder {
3826        calls: Vec<Vec<String>>,
3827    }
3828
3829    impl RecordingEmbedder {
3830        fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3831            let vectors = texts
3832                .iter()
3833                .map(|text| deterministic_test_vector(text))
3834                .collect();
3835            self.calls.push(texts);
3836            Ok(vectors)
3837        }
3838
3839        fn total_embedded_texts(&self) -> usize {
3840            self.calls.iter().map(Vec::len).sum()
3841        }
3842
3843        fn embedded_texts(&self) -> Vec<&str> {
3844            self.calls
3845                .iter()
3846                .flat_map(|batch| batch.iter().map(String::as_str))
3847                .collect()
3848        }
3849    }
3850
3851    fn deterministic_test_vector(text: &str) -> Vec<f32> {
3852        let hash = blake3::hash(text.as_bytes());
3853        let bytes = hash.as_bytes();
3854        vec![
3855            1.0,
3856            bytes[0] as f32 / 255.0,
3857            bytes[1] as f32 / 255.0,
3858            bytes[2] as f32 / 255.0,
3859        ]
3860    }
3861
3862    fn build_recorded_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3863        let mut embedder = RecordingEmbedder::default();
3864        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3865        SemanticIndex::build(project_root, files, &mut embed, 16).unwrap()
3866    }
3867
3868    fn force_stale(index: &mut SemanticIndex, file: &Path) {
3869        set_file_metadata(index, file, SystemTime::UNIX_EPOCH, 0);
3870    }
3871
3872    fn write_source(path: &Path, source: &str) {
3873        if let Some(parent) = path.parent() {
3874            fs::create_dir_all(parent).unwrap();
3875        }
3876        fs::write(path, source).unwrap();
3877    }
3878
3879    fn entries_for_file<'a>(index: &'a SemanticIndex, file: &Path) -> Vec<&'a EmbeddingEntry> {
3880        index
3881            .entries
3882            .iter()
3883            .filter(|entry| entry.chunk.file == file)
3884            .collect()
3885    }
3886
3887    fn entry_by_name<'a>(index: &'a SemanticIndex, file: &Path, name: &str) -> &'a EmbeddingEntry {
3888        index
3889            .entries
3890            .iter()
3891            .find(|entry| entry.chunk.file == file && entry.chunk.name == name)
3892            .unwrap_or_else(|| panic!("missing semantic entry {name} in {}", file.display()))
3893    }
3894
3895    fn file_summary_entry<'a>(index: &'a SemanticIndex, file: &Path) -> &'a EmbeddingEntry {
3896        index
3897            .entries
3898            .iter()
3899            .find(|entry| entry.chunk.file == file && entry.chunk.kind == SymbolKind::FileSummary)
3900            .unwrap_or_else(|| panic!("missing file-summary entry in {}", file.display()))
3901    }
3902
3903    #[test]
3904    fn refresh_stale_line_shift_reuses_all_chunks_and_retains_entries() {
3905        let temp = tempfile::tempdir().unwrap();
3906        let project_root = temp.path();
3907        let file = project_root.join("src/lib.rs");
3908        let original = "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n";
3909        write_source(&file, original);
3910
3911        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3912        let original_entry_count = index.entries.len();
3913        let original_alpha_vector = entry_by_name(&index, &file, "alpha").vector.clone();
3914
3915        write_source(&file, &format!("\n{original}"));
3916        force_stale(&mut index, &file);
3917
3918        let mut embedder = RecordingEmbedder::default();
3919        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3920        let mut progress = |_done: usize, _total: usize| {};
3921        let summary = index
3922            .refresh_stale_files(
3923                project_root,
3924                std::slice::from_ref(&file),
3925                &mut embed,
3926                16,
3927                &mut progress,
3928            )
3929            .unwrap();
3930
3931        assert_eq!(summary.changed, 1);
3932        assert_eq!(embedder.total_embedded_texts(), 0);
3933        assert_eq!(index.entries.len(), original_entry_count);
3934        let shifted_alpha = entry_by_name(&index, &file, "alpha");
3935        assert_eq!(shifted_alpha.chunk.start_line, 1);
3936        assert_eq!(shifted_alpha.vector, original_alpha_vector);
3937    }
3938
3939    #[test]
3940    fn refresh_invalidated_line_shift_emits_full_replacement_delta_for_apply() {
3941        let temp = tempfile::tempdir().unwrap();
3942        let project_root = temp.path();
3943        let file = project_root.join("src/lib.rs");
3944        let original = "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n";
3945        write_source(&file, original);
3946
3947        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3948        let mut serving_index = worker_index.clone();
3949        let original_entry_count = worker_index.entries.len();
3950
3951        write_source(&file, &format!("\n{original}"));
3952
3953        let mut embedder = RecordingEmbedder::default();
3954        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3955        let mut progress = |_done: usize, _total: usize| {};
3956        let update = worker_index
3957            .refresh_invalidated_files(
3958                project_root,
3959                std::slice::from_ref(&file),
3960                &mut embed,
3961                16,
3962                100,
3963                &mut progress,
3964            )
3965            .unwrap();
3966
3967        assert_eq!(embedder.total_embedded_texts(), 0);
3968        assert_eq!(update.added_entries.len(), original_entry_count);
3969        assert_eq!(worker_index.entries.len(), original_entry_count);
3970
3971        serving_index.apply_refresh_update(
3972            update.added_entries,
3973            update.updated_metadata,
3974            &update.completed_paths,
3975        );
3976
3977        assert_eq!(serving_index.entries.len(), original_entry_count);
3978        assert_eq!(
3979            entries_for_file(&serving_index, &file).len(),
3980            original_entry_count
3981        );
3982        assert_eq!(
3983            entry_by_name(&serving_index, &file, "alpha")
3984                .chunk
3985                .start_line,
3986            1
3987        );
3988    }
3989
3990    #[test]
3991    fn refresh_invalidated_one_symbol_edit_embeds_only_changed_symbol() {
3992        let temp = tempfile::tempdir().unwrap();
3993        let project_root = temp.path();
3994        let file = project_root.join("src/lib.rs");
3995        write_source(
3996            &file,
3997            "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n",
3998        );
3999
4000        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4001        let original_entry_count = index.entries.len();
4002        let beta_vector = entry_by_name(&index, &file, "beta").vector.clone();
4003
4004        write_source(
4005            &file,
4006            "pub fn alpha() -> i32 {\n    10\n}\n\npub fn beta() -> i32 {\n    2\n}\n",
4007        );
4008
4009        let mut embedder = RecordingEmbedder::default();
4010        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4011        let mut progress = |_done: usize, _total: usize| {};
4012        let update = index
4013            .refresh_invalidated_files(
4014                project_root,
4015                std::slice::from_ref(&file),
4016                &mut embed,
4017                16,
4018                100,
4019                &mut progress,
4020            )
4021            .unwrap();
4022
4023        assert_eq!(embedder.total_embedded_texts(), 1);
4024        assert!(embedder.embedded_texts()[0].contains("name:alpha"));
4025        assert_eq!(update.added_entries.len(), original_entry_count);
4026        assert_eq!(entry_by_name(&index, &file, "beta").vector, beta_vector);
4027    }
4028
4029    #[test]
4030    fn refresh_reuses_one_old_vector_for_two_byte_identical_symbols() {
4031        let temp = tempfile::tempdir().unwrap();
4032        let project_root = temp.path();
4033        let file = project_root.join("src/dupe.js");
4034        let one_duplicate = "function duplicate() {\n  return 1;\n}\n";
4035        write_source(&file, one_duplicate);
4036
4037        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4038        let original_vector = entry_by_name(&index, &file, "duplicate").vector.clone();
4039
4040        write_source(&file, &format!("{one_duplicate}\n{one_duplicate}"));
4041
4042        let mut embedder = RecordingEmbedder::default();
4043        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4044        let mut progress = |_done: usize, _total: usize| {};
4045        index
4046            .refresh_invalidated_files(
4047                project_root,
4048                std::slice::from_ref(&file),
4049                &mut embed,
4050                16,
4051                100,
4052                &mut progress,
4053            )
4054            .unwrap();
4055
4056        let duplicate_entries = index
4057            .entries
4058            .iter()
4059            .filter(|entry| entry.chunk.file == file && entry.chunk.name == "duplicate")
4060            .collect::<Vec<_>>();
4061        assert_eq!(duplicate_entries.len(), 2);
4062        assert_eq!(embedder.total_embedded_texts(), 0);
4063        assert_eq!(duplicate_entries[0].vector, original_vector);
4064        assert_eq!(duplicate_entries[1].vector, original_vector);
4065    }
4066
4067    #[test]
4068    fn file_summary_reuses_on_body_edit_and_misses_on_leading_doc_edit() {
4069        let temp = tempfile::tempdir().unwrap();
4070        let project_root = temp.path();
4071        let file = project_root.join("src/lib.rs");
4072        write_source(
4073            &file,
4074            "//! module docs v1\n\npub fn alpha() -> i32 {\n    1\n}\n",
4075        );
4076
4077        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4078        let summary_before = file_summary_entry(&index, &file).vector.clone();
4079
4080        write_source(
4081            &file,
4082            "//! module docs v1\n\npub fn alpha() -> i32 {\n    2\n}\n",
4083        );
4084        let mut body_embedder = RecordingEmbedder::default();
4085        let mut body_embed = |texts: Vec<String>| body_embedder.embed(texts);
4086        let mut progress = |_done: usize, _total: usize| {};
4087        index
4088            .refresh_invalidated_files(
4089                project_root,
4090                std::slice::from_ref(&file),
4091                &mut body_embed,
4092                16,
4093                100,
4094                &mut progress,
4095            )
4096            .unwrap();
4097        assert_eq!(body_embedder.total_embedded_texts(), 1);
4098        assert!(body_embedder.embedded_texts()[0].contains("name:alpha"));
4099        assert_eq!(file_summary_entry(&index, &file).vector, summary_before);
4100
4101        write_source(
4102            &file,
4103            "//! module docs v2\n\npub fn alpha() -> i32 {\n    2\n}\n",
4104        );
4105        let mut doc_embedder = RecordingEmbedder::default();
4106        let mut doc_embed = |texts: Vec<String>| doc_embedder.embed(texts);
4107        index
4108            .refresh_invalidated_files(
4109                project_root,
4110                std::slice::from_ref(&file),
4111                &mut doc_embed,
4112                16,
4113                100,
4114                &mut progress,
4115            )
4116            .unwrap();
4117
4118        assert_eq!(doc_embedder.total_embedded_texts(), 1);
4119        assert!(doc_embedder.embedded_texts()[0].contains("kind:file-summary"));
4120        assert_ne!(file_summary_entry(&index, &file).vector, summary_before);
4121    }
4122
4123    #[test]
4124    fn refresh_invalidated_deleted_file_drops_entries_without_embedding() {
4125        let temp = tempfile::tempdir().unwrap();
4126        let project_root = temp.path();
4127        let file = project_root.join("src/lib.rs");
4128        write_source(&file, "pub fn alpha() -> i32 {\n    1\n}\n");
4129
4130        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4131        let mut serving_index = worker_index.clone();
4132        fs::remove_file(&file).unwrap();
4133
4134        let mut embedder = RecordingEmbedder::default();
4135        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4136        let mut progress = |_done: usize, _total: usize| {};
4137        let update = worker_index
4138            .refresh_invalidated_files(
4139                project_root,
4140                std::slice::from_ref(&file),
4141                &mut embed,
4142                16,
4143                100,
4144                &mut progress,
4145            )
4146            .unwrap();
4147
4148        assert_eq!(update.summary.deleted, 1);
4149        assert_eq!(embedder.total_embedded_texts(), 0);
4150        assert!(worker_index.entries.is_empty());
4151
4152        serving_index.apply_refresh_update(
4153            update.added_entries,
4154            update.updated_metadata,
4155            &update.completed_paths,
4156        );
4157        assert!(serving_index.entries.is_empty());
4158    }
4159
4160    #[test]
4161    fn watcher_collect_failure_does_not_resurrect_stale_entries() {
4162        let temp = tempfile::tempdir().unwrap();
4163        let project_root = temp.path();
4164        let file = project_root.join("src/lib.rs");
4165        write_source(&file, "pub fn alpha() -> i32 {\n    1\n}\n");
4166
4167        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4168        let mut serving_index = worker_index.clone();
4169        fs::write(&file, [0xff, 0xfe, 0xfd]).unwrap();
4170
4171        let mut embedder = RecordingEmbedder::default();
4172        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4173        let mut progress = |_done: usize, _total: usize| {};
4174        let update = worker_index
4175            .refresh_invalidated_files(
4176                project_root,
4177                std::slice::from_ref(&file),
4178                &mut embed,
4179                16,
4180                100,
4181                &mut progress,
4182            )
4183            .unwrap();
4184
4185        assert_eq!(embedder.total_embedded_texts(), 0);
4186        assert!(update.added_entries.is_empty());
4187        assert!(worker_index.entries.is_empty());
4188        assert!(!worker_index.file_mtimes.contains_key(&file));
4189
4190        serving_index.apply_refresh_update(
4191            update.added_entries,
4192            update.updated_metadata,
4193            &update.completed_paths,
4194        );
4195        assert!(serving_index.entries.is_empty());
4196        assert!(!serving_index.file_mtimes.contains_key(&file));
4197    }
4198
4199    #[test]
4200    fn refresh_invalidated_cap_deferral_remains_file_count_based() {
4201        let temp = tempfile::tempdir().unwrap();
4202        let project_root = temp.path();
4203        let indexed = project_root.join("src/a.rs");
4204        let deferred = project_root.join("src/b.rs");
4205        write_source(&indexed, "pub fn alpha() -> i32 {\n    1\n}\n");
4206        write_source(&deferred, "pub fn beta() -> i32 {\n    2\n}\n");
4207
4208        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&indexed));
4209        let mut embedder = RecordingEmbedder::default();
4210        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4211        let mut progress = |_done: usize, _total: usize| {};
4212        let update = index
4213            .refresh_invalidated_files(
4214                project_root,
4215                std::slice::from_ref(&deferred),
4216                &mut embed,
4217                16,
4218                1,
4219                &mut progress,
4220            )
4221            .unwrap();
4222
4223        assert_eq!(update.summary.total_processed, 1);
4224        assert_eq!(update.summary.added, 0);
4225        assert_eq!(embedder.total_embedded_texts(), 0);
4226        assert_eq!(index.indexed_file_count(), 1);
4227        assert!(index.deferred_files.contains(&deferred));
4228        assert!(entries_for_file(&index, &deferred).is_empty());
4229    }
4230
4231    #[test]
4232    fn semantic_cache_serialization_skips_paths_outside_project_root() {
4233        let dir = tempfile::tempdir().expect("create temp dir");
4234        let project = fs::canonicalize(dir.path()).expect("canonical project");
4235        let outside = project.join("..").join("outside.rs");
4236        let mut index = SemanticIndex::new(project.clone(), 3);
4237        index
4238            .file_mtimes
4239            .insert(outside.clone(), SystemTime::UNIX_EPOCH);
4240        index.file_sizes.insert(outside.clone(), 1);
4241        index
4242            .file_hashes
4243            .insert(outside.clone(), cache_freshness::zero_hash());
4244        index.entries.push(EmbeddingEntry {
4245            chunk: SemanticChunk {
4246                file: outside,
4247                name: "outside".to_string(),
4248                kind: SymbolKind::Function,
4249                start_line: 0,
4250                end_line: 0,
4251                exported: false,
4252                embed_text: "outside".to_string(),
4253                snippet: "outside".to_string(),
4254            },
4255            vector: vec![1.0, 0.0, 0.0],
4256        });
4257
4258        let bytes = index.to_bytes();
4259        let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
4260        assert_eq!(loaded.entries.len(), 0);
4261        assert!(loaded.file_mtimes.is_empty());
4262    }
4263
4264    #[test]
4265    fn semantic_search_bounded_top_k_matches_reference_full_sort() {
4266        let project_root = test_project_root();
4267        let file = project_root.join("src/lib.rs");
4268        let mut index = SemanticIndex::new(project_root, 2);
4269        let entries = [
4270            ("alpha", vec![1.0, 0.0], false),
4271            ("beta", vec![0.0, 1.0], false),
4272            ("gamma", vec![1.0, 0.0], false),
4273            ("delta", vec![0.5, 0.5], true),
4274            ("epsilon", vec![-1.0, 0.0], false),
4275        ];
4276        for (line, (name, vector, exported)) in entries.into_iter().enumerate() {
4277            index.entries.push(EmbeddingEntry {
4278                chunk: SemanticChunk {
4279                    file: file.clone(),
4280                    name: name.to_string(),
4281                    kind: SymbolKind::Function,
4282                    start_line: line as u32 + 1,
4283                    end_line: line as u32 + 1,
4284                    exported,
4285                    embed_text: name.to_string(),
4286                    snippet: format!("fn {name}() {{}}"),
4287                },
4288                vector,
4289            });
4290        }
4291
4292        let query = vec![1.0, 0.0];
4293        let top_k = 4;
4294        let mut reference: Vec<(f32, usize)> = index
4295            .entries
4296            .iter()
4297            .enumerate()
4298            .map(|(idx, entry)| {
4299                let mut score = cosine_similarity(&query, &entry.vector);
4300                if entry.chunk.exported {
4301                    score *= 1.1;
4302                }
4303                (score, idx)
4304            })
4305            .collect();
4306        reference.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
4307        let expected: Vec<(String, f32)> = reference
4308            .into_iter()
4309            .take(top_k)
4310            .map(|(score, idx)| (index.entries[idx].chunk.name.clone(), score))
4311            .collect();
4312
4313        let actual: Vec<(String, f32)> = index
4314            .search(&query, top_k)
4315            .into_iter()
4316            .map(|result| (result.name, result.score))
4317            .collect();
4318
4319        assert_eq!(
4320            actual.iter().map(|(name, _)| name).collect::<Vec<_>>(),
4321            expected.iter().map(|(name, _)| name).collect::<Vec<_>>()
4322        );
4323        for ((_, actual_score), (_, expected_score)) in actual.iter().zip(expected.iter()) {
4324            assert!((actual_score - expected_score).abs() < 1e-6);
4325        }
4326        assert_eq!(actual[0].0, "alpha");
4327        assert_eq!(actual[1].0, "gamma", "equal scores keep insertion order");
4328        assert!(index.search(&query, 0).is_empty());
4329    }
4330
4331    #[test]
4332    fn test_cosine_similarity_identical() {
4333        let a = vec![1.0, 0.0, 0.0];
4334        let b = vec![1.0, 0.0, 0.0];
4335        assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
4336    }
4337
4338    #[test]
4339    fn test_cosine_similarity_orthogonal() {
4340        let a = vec![1.0, 0.0, 0.0];
4341        let b = vec![0.0, 1.0, 0.0];
4342        assert!(cosine_similarity(&a, &b).abs() < 0.001);
4343    }
4344
4345    #[test]
4346    fn test_cosine_similarity_opposite() {
4347        let a = vec![1.0, 0.0, 0.0];
4348        let b = vec![-1.0, 0.0, 0.0];
4349        assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
4350    }
4351
4352    #[test]
4353    fn test_serialization_roundtrip() {
4354        let project_root = test_project_root();
4355        let file = project_root.join("src/main.rs");
4356        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
4357        index.entries.push(EmbeddingEntry {
4358            chunk: SemanticChunk {
4359                file: file.clone(),
4360                name: "handle_request".to_string(),
4361                kind: SymbolKind::Function,
4362                start_line: 10,
4363                end_line: 25,
4364                exported: true,
4365                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
4366                snippet: "fn handle_request() {\n  // ...\n}".to_string(),
4367            },
4368            vector: vec![0.1, 0.2, 0.3, 0.4],
4369        });
4370        index.dimension = 4;
4371        index
4372            .file_mtimes
4373            .insert(file.clone(), SystemTime::UNIX_EPOCH);
4374        index.file_sizes.insert(file, 0);
4375        index.set_fingerprint(SemanticIndexFingerprint {
4376            backend: "fastembed".to_string(),
4377            model: "all-MiniLM-L6-v2".to_string(),
4378            base_url: FALLBACK_BACKEND.to_string(),
4379            dimension: 4,
4380            chunking_version: default_chunking_version(),
4381        });
4382
4383        let bytes = index.to_bytes();
4384        let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
4385
4386        assert_eq!(restored.entries.len(), 1);
4387        assert_eq!(restored.entries[0].chunk.name, "handle_request");
4388        assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
4389        assert_eq!(restored.dimension, 4);
4390        assert_eq!(restored.backend_label(), Some("fastembed"));
4391        assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
4392    }
4393
4394    #[test]
4395    fn semantic_cache_streaming_persistence_matches_legacy_bytes_and_round_trips() {
4396        let storage = tempfile::tempdir().expect("create storage dir");
4397        let project = storage.path().join("project");
4398        fs::create_dir_all(project.join("src")).expect("create project src");
4399        let file = project.join("src/lib.rs");
4400        fs::write(&file, "pub fn alpha() {}\npub fn beta() {}\n").expect("write source");
4401        let project_root = fs::canonicalize(&project).expect("canonical project");
4402        let file = fs::canonicalize(&file).expect("canonical file");
4403
4404        let mut index = SemanticIndex::new(project_root.clone(), 3);
4405        let mtime = SystemTime::UNIX_EPOCH + Duration::new(123, 456);
4406        index.file_mtimes.insert(file.clone(), mtime);
4407        index.file_sizes.insert(file.clone(), 42);
4408        index
4409            .file_hashes
4410            .insert(file.clone(), cache_freshness::zero_hash());
4411        index.entries.push(EmbeddingEntry {
4412            chunk: SemanticChunk {
4413                file: file.clone(),
4414                name: "alpha".to_string(),
4415                kind: SymbolKind::Function,
4416                start_line: 0,
4417                end_line: 0,
4418                exported: true,
4419                embed_text: "file:src/lib.rs kind:function name:alpha".to_string(),
4420                snippet: "pub fn alpha() {}".to_string(),
4421            },
4422            vector: vec![0.1, 0.2, 0.3],
4423        });
4424        index.entries.push(EmbeddingEntry {
4425            chunk: SemanticChunk {
4426                file: file.clone(),
4427                name: "beta".to_string(),
4428                kind: SymbolKind::Function,
4429                start_line: 1,
4430                end_line: 1,
4431                exported: true,
4432                embed_text: "file:src/lib.rs kind:function name:beta".to_string(),
4433                snippet: "pub fn beta() {}".to_string(),
4434            },
4435            vector: vec![0.4, 0.5, 0.6],
4436        });
4437        let fingerprint = SemanticIndexFingerprint {
4438            backend: "fastembed".to_string(),
4439            model: "all-MiniLM-L6-v2".to_string(),
4440            base_url: FALLBACK_BACKEND.to_string(),
4441            dimension: 3,
4442            chunking_version: default_chunking_version(),
4443        };
4444        index.set_fingerprint(fingerprint.clone());
4445
4446        let legacy_bytes = legacy_semantic_index_bytes(&index);
4447        assert_eq!(index.to_bytes(), legacy_bytes);
4448
4449        index.write_to_disk(storage.path(), "proj");
4450        let data_path = storage.path().join("semantic/proj/semantic.bin");
4451        assert_eq!(
4452            fs::read(&data_path).expect("read semantic.bin"),
4453            legacy_bytes
4454        );
4455
4456        let loaded = SemanticIndex::read_from_disk(
4457            storage.path(),
4458            "proj",
4459            &project_root,
4460            false,
4461            Some(&fingerprint.as_string()),
4462        )
4463        .expect("load semantic index");
4464        assert_eq!(loaded.entries.len(), index.entries.len());
4465        assert_eq!(loaded.dimension, index.dimension);
4466        assert_eq!(
4467            loaded.fingerprint().unwrap().as_string(),
4468            fingerprint.as_string()
4469        );
4470        assert_eq!(loaded.file_mtimes.get(&file), Some(&mtime));
4471        assert_eq!(loaded.file_sizes.get(&file), Some(&42));
4472        assert_eq!(
4473            loaded.file_hashes.get(&file),
4474            Some(&cache_freshness::zero_hash())
4475        );
4476        for (actual, expected) in loaded.entries.iter().zip(index.entries.iter()) {
4477            assert_eq!(actual.chunk.file, expected.chunk.file);
4478            assert_eq!(actual.chunk.name, expected.chunk.name);
4479            assert_eq!(actual.chunk.kind, expected.chunk.kind);
4480            assert_eq!(actual.chunk.start_line, expected.chunk.start_line);
4481            assert_eq!(actual.chunk.end_line, expected.chunk.end_line);
4482            assert_eq!(actual.chunk.exported, expected.chunk.exported);
4483            assert_eq!(actual.chunk.embed_text, expected.chunk.embed_text);
4484            assert_eq!(actual.chunk.snippet, expected.chunk.snippet);
4485            assert_eq!(actual.vector, expected.vector);
4486        }
4487        assert_eq!(loaded.to_bytes(), legacy_bytes);
4488    }
4489
4490    #[test]
4491    fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
4492        let cases = [
4493            (SymbolKind::Function, 0),
4494            (SymbolKind::Class, 1),
4495            (SymbolKind::Method, 2),
4496            (SymbolKind::Struct, 3),
4497            (SymbolKind::Interface, 4),
4498            (SymbolKind::Enum, 5),
4499            (SymbolKind::TypeAlias, 6),
4500            (SymbolKind::Variable, 7),
4501            (SymbolKind::Heading, 8),
4502            (SymbolKind::FileSummary, 9),
4503        ];
4504
4505        for (kind, encoded) in cases {
4506            assert_eq!(symbol_kind_to_u8(&kind), encoded);
4507            assert_eq!(u8_to_symbol_kind(encoded), kind);
4508        }
4509    }
4510
4511    #[test]
4512    fn test_search_top_k() {
4513        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4514        index.dimension = 3;
4515
4516        // Add entries with known vectors
4517        for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
4518            let mut vec = vec![0.0f32; 3];
4519            vec[i] = 1.0; // orthogonal vectors
4520            index.entries.push(EmbeddingEntry {
4521                chunk: SemanticChunk {
4522                    file: PathBuf::from("/src/lib.rs"),
4523                    name: name.to_string(),
4524                    kind: SymbolKind::Function,
4525                    start_line: (i * 10 + 1) as u32,
4526                    end_line: (i * 10 + 5) as u32,
4527                    exported: true,
4528                    embed_text: format!("kind:function name:{}", name),
4529                    snippet: format!("fn {}() {{}}", name),
4530                },
4531                vector: vec,
4532            });
4533        }
4534
4535        // Query aligned with "auth" (index 0)
4536        let query = vec![0.9, 0.1, 0.0];
4537        let results = index.search(&query, 2);
4538
4539        assert_eq!(results.len(), 2);
4540        assert_eq!(results[0].name, "auth"); // highest score
4541        assert!(results[0].score > results[1].score);
4542    }
4543
4544    #[test]
4545    fn test_empty_index_search() {
4546        let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4547        let results = index.search(&[0.1, 0.2, 0.3], 10);
4548        assert!(results.is_empty());
4549    }
4550
4551    #[test]
4552    fn single_line_symbol_builds_non_empty_snippet() {
4553        let symbol = Symbol {
4554            name: "answer".to_string(),
4555            kind: SymbolKind::Variable,
4556            range: crate::symbols::Range {
4557                start_line: 0,
4558                start_col: 0,
4559                end_line: 0,
4560                end_col: 24,
4561            },
4562            signature: Some("const answer = 42".to_string()),
4563            scope_chain: Vec::new(),
4564            exported: true,
4565            parent: None,
4566        };
4567        let source = "export const answer = 42;\n";
4568
4569        let snippet = build_snippet(&symbol, source);
4570
4571        assert_eq!(snippet, "export const answer = 42;");
4572    }
4573
4574    #[test]
4575    fn optimized_file_chunk_collection_matches_file_parser_path() {
4576        let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
4577        let file = project_root.join("src/semantic_index.rs");
4578        let source = std::fs::read_to_string(&file).unwrap();
4579
4580        let mut legacy_parser = FileParser::new();
4581        let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
4582        let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
4583
4584        let mut parsers = HashMap::new();
4585        let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
4586
4587        assert_eq!(
4588            chunk_fingerprint(&optimized_chunks),
4589            chunk_fingerprint(&legacy_chunks)
4590        );
4591    }
4592
4593    fn chunk_fingerprint(
4594        chunks: &[SemanticChunk],
4595    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
4596        chunks
4597            .iter()
4598            .map(|chunk| {
4599                (
4600                    chunk.name.clone(),
4601                    chunk.kind.clone(),
4602                    chunk.start_line,
4603                    chunk.end_line,
4604                    chunk.exported,
4605                    chunk.embed_text.clone(),
4606                    chunk.snippet.clone(),
4607                )
4608            })
4609            .collect()
4610    }
4611
4612    #[test]
4613    fn collect_file_chunks_skips_oversized_file() {
4614        let dir = tempfile::tempdir().unwrap();
4615        let big = dir.path().join("huge.ts");
4616        // Just over the cap: a valid TS file that would otherwise yield chunks.
4617        let filler = "export const x = 1;\n"
4618            .repeat(((MAX_SEMANTIC_FILE_BYTES as usize) / "export const x = 1;\n".len()) + 16);
4619        std::fs::write(&big, &filler).unwrap();
4620        assert!(big.metadata().unwrap().len() > MAX_SEMANTIC_FILE_BYTES);
4621
4622        let mut parsers = HashMap::new();
4623        // Oversized → tracked with zero chunks, NOT an error (so the caller keeps
4624        // the file in metadata and freshness skips re-reading it).
4625        let chunks = collect_file_chunks(dir.path(), &big, &mut parsers).unwrap();
4626        assert!(chunks.is_empty(), "oversized file must yield no chunks");
4627
4628        // A small file of the same language still produces chunks.
4629        let small = dir.path().join("small.ts");
4630        std::fs::write(&small, "export function foo() { return 1; }\n").unwrap();
4631        let small_chunks = collect_file_chunks(dir.path(), &small, &mut parsers).unwrap();
4632        assert!(!small_chunks.is_empty(), "small file should still chunk");
4633    }
4634
4635    #[test]
4636    fn rejects_oversized_dimension_during_deserialization() {
4637        let mut bytes = Vec::new();
4638        bytes.push(1u8);
4639        bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
4640        bytes.extend_from_slice(&0u32.to_le_bytes());
4641        bytes.extend_from_slice(&0u32.to_le_bytes());
4642
4643        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4644    }
4645
4646    #[test]
4647    fn rejects_oversized_entry_count_during_deserialization() {
4648        let mut bytes = Vec::new();
4649        bytes.push(1u8);
4650        bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
4651        bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
4652        bytes.extend_from_slice(&0u32.to_le_bytes());
4653
4654        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4655    }
4656
4657    #[test]
4658    fn invalidate_file_removes_entries_and_mtime() {
4659        let target = PathBuf::from("/src/main.rs");
4660        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4661        index.entries.push(EmbeddingEntry {
4662            chunk: SemanticChunk {
4663                file: target.clone(),
4664                name: "main".to_string(),
4665                kind: SymbolKind::Function,
4666                start_line: 0,
4667                end_line: 1,
4668                exported: false,
4669                embed_text: "main".to_string(),
4670                snippet: "fn main() {}".to_string(),
4671            },
4672            vector: vec![1.0; DEFAULT_DIMENSION],
4673        });
4674        index
4675            .file_mtimes
4676            .insert(target.clone(), SystemTime::UNIX_EPOCH);
4677        index.file_sizes.insert(target.clone(), 0);
4678
4679        index.invalidate_file(&target);
4680
4681        assert!(index.entries.is_empty());
4682        assert!(!index.file_mtimes.contains_key(&target));
4683        assert!(!index.file_sizes.contains_key(&target));
4684    }
4685
4686    #[test]
4687    fn refresh_missing_changed_file_is_purged_after_collect() {
4688        let temp = tempfile::tempdir().unwrap();
4689        let project_root = temp.path();
4690        let file = project_root.join("src/lib.rs");
4691        fs::create_dir_all(file.parent().unwrap()).unwrap();
4692        write_rust_file(&file, "vanished_symbol");
4693
4694        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4695        let original_size = *index.file_sizes.get(&file).unwrap();
4696        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
4697        fs::remove_file(&file).unwrap();
4698
4699        let mut embed = test_vector_for_texts;
4700        let mut progress = |_done: usize, _total: usize| {};
4701        let summary = index
4702            .refresh_stale_files(
4703                project_root,
4704                std::slice::from_ref(&file),
4705                &mut embed,
4706                8,
4707                &mut progress,
4708            )
4709            .unwrap();
4710
4711        assert_eq!(summary.changed, 0);
4712        assert_eq!(summary.added, 0);
4713        assert_eq!(summary.deleted, 1);
4714        assert!(index.entries.is_empty());
4715        assert!(!index.file_mtimes.contains_key(&file));
4716        assert!(!index.file_sizes.contains_key(&file));
4717        assert!(!index.file_hashes.contains_key(&file));
4718    }
4719
4720    #[test]
4721    fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
4722        let temp = tempfile::tempdir().unwrap();
4723        let project_root = temp.path();
4724        let file = project_root.join("src/lib.rs");
4725        fs::create_dir_all(file.parent().unwrap()).unwrap();
4726        write_rust_file(&file, "kept_symbol");
4727
4728        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4729        let original_entry_count = index.entries.len();
4730        let original_mtime = *index.file_mtimes.get(&file).unwrap();
4731        let original_size = *index.file_sizes.get(&file).unwrap();
4732
4733        let stale_mtime = SystemTime::UNIX_EPOCH;
4734        set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
4735        fs::remove_file(&file).unwrap();
4736        fs::create_dir(&file).unwrap();
4737
4738        let mut embed = test_vector_for_texts;
4739        let mut progress = |_done: usize, _total: usize| {};
4740        let summary = index
4741            .refresh_stale_files(
4742                project_root,
4743                std::slice::from_ref(&file),
4744                &mut embed,
4745                8,
4746                &mut progress,
4747            )
4748            .unwrap();
4749
4750        assert_eq!(summary.changed, 0);
4751        assert_eq!(summary.added, 0);
4752        assert_eq!(summary.deleted, 0);
4753        assert_eq!(index.entries.len(), original_entry_count);
4754        assert!(index
4755            .entries
4756            .iter()
4757            .any(|entry| entry.chunk.name == "kept_symbol"));
4758        assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
4759        assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
4760        assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
4761    }
4762
4763    #[test]
4764    fn refresh_never_indexed_file_error_does_not_record_mtime() {
4765        let temp = tempfile::tempdir().unwrap();
4766        let project_root = temp.path();
4767        let missing = project_root.join("src/missing.rs");
4768        fs::create_dir_all(missing.parent().unwrap()).unwrap();
4769
4770        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4771        let mut embed = test_vector_for_texts;
4772        let mut progress = |_done: usize, _total: usize| {};
4773        let summary = index
4774            .refresh_stale_files(
4775                project_root,
4776                std::slice::from_ref(&missing),
4777                &mut embed,
4778                8,
4779                &mut progress,
4780            )
4781            .unwrap();
4782
4783        assert_eq!(summary.added, 0);
4784        assert_eq!(summary.changed, 0);
4785        assert_eq!(summary.deleted, 0);
4786        assert!(!index.file_mtimes.contains_key(&missing));
4787        assert!(!index.file_sizes.contains_key(&missing));
4788        assert!(index.entries.is_empty());
4789    }
4790
4791    #[test]
4792    fn refresh_reports_added_for_new_files() {
4793        let temp = tempfile::tempdir().unwrap();
4794        let project_root = temp.path();
4795        let existing = project_root.join("src/lib.rs");
4796        let added = project_root.join("src/new.rs");
4797        fs::create_dir_all(existing.parent().unwrap()).unwrap();
4798        write_rust_file(&existing, "existing_symbol");
4799        write_rust_file(&added, "added_symbol");
4800
4801        let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
4802        let mut embed = test_vector_for_texts;
4803        let mut progress = |_done: usize, _total: usize| {};
4804        let summary = index
4805            .refresh_stale_files(
4806                project_root,
4807                &[existing.clone(), added.clone()],
4808                &mut embed,
4809                8,
4810                &mut progress,
4811            )
4812            .unwrap();
4813
4814        assert_eq!(summary.added, 1);
4815        assert_eq!(summary.changed, 0);
4816        assert_eq!(summary.deleted, 0);
4817        assert_eq!(summary.total_processed, 2);
4818        assert!(index.file_mtimes.contains_key(&added));
4819        assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
4820    }
4821
4822    #[test]
4823    fn refresh_reports_deleted_for_removed_files() {
4824        let temp = tempfile::tempdir().unwrap();
4825        let project_root = temp.path();
4826        let deleted = project_root.join("src/deleted.rs");
4827        fs::create_dir_all(deleted.parent().unwrap()).unwrap();
4828        write_rust_file(&deleted, "deleted_symbol");
4829
4830        let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
4831        fs::remove_file(&deleted).unwrap();
4832
4833        let mut embed = test_vector_for_texts;
4834        let mut progress = |_done: usize, _total: usize| {};
4835        let summary = index
4836            .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
4837            .unwrap();
4838
4839        assert_eq!(summary.deleted, 1);
4840        assert_eq!(summary.changed, 0);
4841        assert_eq!(summary.added, 0);
4842        assert_eq!(summary.total_processed, 1);
4843        assert!(!index.file_mtimes.contains_key(&deleted));
4844        assert!(index.entries.is_empty());
4845    }
4846
4847    #[test]
4848    fn refresh_reports_changed_for_modified_files() {
4849        let temp = tempfile::tempdir().unwrap();
4850        let project_root = temp.path();
4851        let file = project_root.join("src/lib.rs");
4852        fs::create_dir_all(file.parent().unwrap()).unwrap();
4853        write_rust_file(&file, "old_symbol");
4854
4855        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4856        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
4857        write_rust_file(&file, "new_symbol");
4858
4859        let mut embed = test_vector_for_texts;
4860        let mut progress = |_done: usize, _total: usize| {};
4861        let summary = index
4862            .refresh_stale_files(
4863                project_root,
4864                std::slice::from_ref(&file),
4865                &mut embed,
4866                8,
4867                &mut progress,
4868            )
4869            .unwrap();
4870
4871        assert_eq!(summary.changed, 1);
4872        assert_eq!(summary.added, 0);
4873        assert_eq!(summary.deleted, 0);
4874        assert_eq!(summary.total_processed, 1);
4875        assert!(index
4876            .entries
4877            .iter()
4878            .any(|entry| entry.chunk.name == "new_symbol"));
4879        assert!(!index
4880            .entries
4881            .iter()
4882            .any(|entry| entry.chunk.name == "old_symbol"));
4883    }
4884
4885    #[test]
4886    fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
4887        let temp = tempfile::tempdir().unwrap();
4888        let project_root = temp.path();
4889        let file = project_root.join("src/lib.rs");
4890        fs::create_dir_all(file.parent().unwrap()).unwrap();
4891        write_rust_file(&file, "clean_symbol");
4892
4893        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4894        let original_entries = index.entries.len();
4895        let mut embed_called = false;
4896        let mut embed = |texts: Vec<String>| {
4897            embed_called = true;
4898            test_vector_for_texts(texts)
4899        };
4900        let mut progress = |_done: usize, _total: usize| {};
4901        let summary = index
4902            .refresh_stale_files(
4903                project_root,
4904                std::slice::from_ref(&file),
4905                &mut embed,
4906                8,
4907                &mut progress,
4908            )
4909            .unwrap();
4910
4911        assert!(summary.is_noop());
4912        assert_eq!(summary.total_processed, 1);
4913        assert!(!embed_called);
4914        assert_eq!(index.entries.len(), original_entries);
4915    }
4916
4917    #[test]
4918    fn detects_missing_onnx_runtime_from_dynamic_load_error() {
4919        let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
4920
4921        assert!(is_onnx_runtime_unavailable(message));
4922    }
4923
4924    #[test]
4925    fn formats_missing_onnx_runtime_with_install_hint() {
4926        let message = format_embedding_init_error(
4927            "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
4928        );
4929
4930        assert!(message.starts_with("ONNX Runtime not found. Install via:"));
4931        assert!(message.contains("Original error:"));
4932    }
4933
4934    #[test]
4935    fn interactive_query_embedding_model_caps_remote_timeout() {
4936        let mut config = SemanticBackendConfig {
4937            backend: SemanticBackend::OpenAiCompatible,
4938            model: "test-embedding".to_string(),
4939            base_url: Some("http://127.0.0.1:9".to_string()),
4940            api_key_env: None,
4941            timeout_ms: 0,
4942            max_batch_size: 64,
4943            max_files: 20_000,
4944        };
4945
4946        let build_model = SemanticEmbeddingModel::from_config(&config).unwrap();
4947        let query_model = SemanticEmbeddingModel::from_config_for_query(&config).unwrap();
4948        assert_eq!(
4949            build_model.timeout_ms(),
4950            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS,
4951            "background build keeps the longer default embedding timeout"
4952        );
4953        assert_eq!(
4954            query_model.timeout_ms(),
4955            DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS,
4956            "interactive query embedding is capped below the dispatch transport timeout"
4957        );
4958
4959        config.timeout_ms = 60_000;
4960        let query_model = SemanticEmbeddingModel::from_config_for_query(&config).unwrap();
4961        assert_eq!(
4962            query_model.timeout_ms(),
4963            DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS,
4964            "explicitly long backend timeouts are capped for interactive queries"
4965        );
4966
4967        config.timeout_ms = 3_000;
4968        let query_model = SemanticEmbeddingModel::from_config_for_query(&config).unwrap();
4969        assert_eq!(
4970            query_model.timeout_ms(),
4971            3_000,
4972            "shorter explicit timeouts are respected for interactive queries"
4973        );
4974    }
4975
4976    #[test]
4977    fn openai_compatible_backend_embeds_with_mock_server() {
4978        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
4979            assert!(request_line.starts_with("POST "));
4980            assert_eq!(path, "/v1/embeddings");
4981            "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
4982        });
4983
4984        let config = SemanticBackendConfig {
4985            backend: SemanticBackend::OpenAiCompatible,
4986            model: "test-embedding".to_string(),
4987            base_url: Some(base_url),
4988            api_key_env: None,
4989            timeout_ms: 5_000,
4990            max_batch_size: 64,
4991            max_files: 20_000,
4992        };
4993
4994        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
4995        let vectors = model
4996            .embed(vec!["hello".to_string(), "world".to_string()])
4997            .unwrap();
4998
4999        assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
5000        handle.join().unwrap();
5001    }
5002
5003    /// Regression for issue #36: AFT was sending TWO Content-Type headers
5004    /// on the OpenAI embeddings request — once implicitly via `.json(&body)`
5005    /// and again explicitly via `.header("Content-Type", "application/json")`.
5006    /// reqwest's `.header()` calls `HeaderMap::append`, which produces two
5007    /// headers on the wire. OpenAI's /v1/embeddings endpoint rejects that
5008    /// with `HTTP 400 "you must provide a model parameter"` even though the
5009    /// body actually contains `model`. The fix is to drop the explicit
5010    /// `.header("Content-Type", ...)` call. This test pins that we send
5011    /// exactly one Content-Type header.
5012    #[test]
5013    fn openai_compatible_request_has_single_content_type_header() {
5014        use std::sync::{Arc, Mutex};
5015        let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
5016        let captured_for_thread = Arc::clone(&captured);
5017
5018        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
5019        let addr = listener.local_addr().expect("local addr");
5020        let handle = thread::spawn(move || {
5021            let (mut stream, _) = listener.accept().expect("accept");
5022            let mut buf = Vec::new();
5023            let mut chunk = [0u8; 4096];
5024            let mut header_end = None;
5025            let mut content_length = 0usize;
5026            loop {
5027                let n = stream.read(&mut chunk).expect("read");
5028                if n == 0 {
5029                    break;
5030                }
5031                buf.extend_from_slice(&chunk[..n]);
5032                if header_end.is_none() {
5033                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
5034                        header_end = Some(pos + 4);
5035                        for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
5036                            if let Some(value) = line.strip_prefix("Content-Length:") {
5037                                content_length = value.trim().parse::<usize>().unwrap_or(0);
5038                            }
5039                        }
5040                    }
5041                }
5042                if let Some(end) = header_end {
5043                    if buf.len() >= end + content_length {
5044                        break;
5045                    }
5046                }
5047            }
5048            *captured_for_thread.lock().unwrap() = buf;
5049            let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
5050            let response = format!(
5051                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
5052                body.len(),
5053                body
5054            );
5055            let _ = stream.write_all(response.as_bytes());
5056        });
5057
5058        let config = SemanticBackendConfig {
5059            backend: SemanticBackend::OpenAiCompatible,
5060            model: "text-embedding-3-small".to_string(),
5061            base_url: Some(format!("http://{}", addr)),
5062            api_key_env: None,
5063            timeout_ms: 5_000,
5064            max_batch_size: 64,
5065            max_files: 20_000,
5066        };
5067        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
5068        let _ = model.embed(vec!["probe".to_string()]).unwrap();
5069        handle.join().unwrap();
5070
5071        let bytes = captured.lock().unwrap().clone();
5072        let request = String::from_utf8_lossy(&bytes);
5073
5074        // Lowercase line counts because HTTP headers are case-insensitive
5075        // and reqwest may emit `content-type` in lowercase under HTTP/2.
5076        let content_type_lines = request
5077            .lines()
5078            .filter(|line| {
5079                let lower = line.to_ascii_lowercase();
5080                lower.starts_with("content-type:")
5081            })
5082            .count();
5083        assert_eq!(
5084            content_type_lines, 1,
5085            "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
5086        );
5087
5088        // The body must still include the model field — pin this so a future
5089        // change can't accidentally drop `model` while fixing duplicate headers.
5090        assert!(
5091            request.contains(r#""model":"text-embedding-3-small""#),
5092            "request body should contain model field; full request:\n{request}",
5093        );
5094    }
5095
5096    #[test]
5097    fn ollama_backend_embeds_with_mock_server() {
5098        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
5099            assert!(request_line.starts_with("POST "));
5100            assert_eq!(path, "/api/embed");
5101            "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
5102        });
5103
5104        let config = SemanticBackendConfig {
5105            backend: SemanticBackend::Ollama,
5106            model: "embeddinggemma".to_string(),
5107            base_url: Some(base_url),
5108            api_key_env: None,
5109            timeout_ms: 5_000,
5110            max_batch_size: 64,
5111            max_files: 20_000,
5112        };
5113
5114        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
5115        let vectors = model
5116            .embed(vec!["hello".to_string(), "world".to_string()])
5117            .unwrap();
5118
5119        assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
5120        handle.join().unwrap();
5121    }
5122
5123    #[test]
5124    fn read_from_disk_rejects_fingerprint_mismatch() {
5125        let storage = tempfile::tempdir().unwrap();
5126        let project_key = "proj";
5127
5128        let project_root = test_project_root();
5129        let file = project_root.join("src/main.rs");
5130        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
5131        index.entries.push(EmbeddingEntry {
5132            chunk: SemanticChunk {
5133                file: file.clone(),
5134                name: "handle_request".to_string(),
5135                kind: SymbolKind::Function,
5136                start_line: 10,
5137                end_line: 25,
5138                exported: true,
5139                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
5140                snippet: "fn handle_request() {}".to_string(),
5141            },
5142            vector: vec![0.1, 0.2, 0.3],
5143        });
5144        index.dimension = 3;
5145        index
5146            .file_mtimes
5147            .insert(file.clone(), SystemTime::UNIX_EPOCH);
5148        index.file_sizes.insert(file, 0);
5149        index.set_fingerprint(SemanticIndexFingerprint {
5150            backend: "openai_compatible".to_string(),
5151            model: "test-embedding".to_string(),
5152            base_url: "http://127.0.0.1:1234/v1".to_string(),
5153            dimension: 3,
5154            chunking_version: default_chunking_version(),
5155        });
5156        index.write_to_disk(storage.path(), project_key);
5157
5158        let matching = index.fingerprint().unwrap().as_string();
5159        assert!(SemanticIndex::read_from_disk(
5160            storage.path(),
5161            project_key,
5162            &project_root,
5163            false,
5164            Some(&matching),
5165        )
5166        .is_some());
5167
5168        let mismatched = SemanticIndexFingerprint {
5169            backend: "ollama".to_string(),
5170            model: "embeddinggemma".to_string(),
5171            base_url: "http://127.0.0.1:11434".to_string(),
5172            dimension: 3,
5173            chunking_version: default_chunking_version(),
5174        }
5175        .as_string();
5176        assert!(SemanticIndex::read_from_disk(
5177            storage.path(),
5178            project_key,
5179            &project_root,
5180            false,
5181            Some(&mismatched),
5182        )
5183        .is_none());
5184    }
5185
5186    #[test]
5187    fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
5188        let storage = tempfile::tempdir().unwrap();
5189        let project_key = "proj-v3";
5190        let dir = storage.path().join("semantic").join(project_key);
5191        fs::create_dir_all(&dir).unwrap();
5192
5193        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
5194        index.entries.push(EmbeddingEntry {
5195            chunk: SemanticChunk {
5196                file: PathBuf::from("/src/main.rs"),
5197                name: "handle_request".to_string(),
5198                kind: SymbolKind::Function,
5199                start_line: 0,
5200                end_line: 0,
5201                exported: true,
5202                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
5203                snippet: "fn handle_request() {}".to_string(),
5204            },
5205            vector: vec![0.1, 0.2, 0.3],
5206        });
5207        index.dimension = 3;
5208        index
5209            .file_mtimes
5210            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
5211        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
5212        let fingerprint = SemanticIndexFingerprint {
5213            backend: "fastembed".to_string(),
5214            model: "test".to_string(),
5215            base_url: FALLBACK_BACKEND.to_string(),
5216            dimension: 3,
5217            chunking_version: default_chunking_version(),
5218        };
5219        index.set_fingerprint(fingerprint.clone());
5220
5221        let mut bytes = index.to_bytes();
5222        bytes[0] = SEMANTIC_INDEX_VERSION_V3;
5223        fs::write(dir.join("semantic.bin"), bytes).unwrap();
5224
5225        assert!(SemanticIndex::read_from_disk(
5226            storage.path(),
5227            project_key,
5228            &test_project_root(),
5229            false,
5230            Some(&fingerprint.as_string())
5231        )
5232        .is_none());
5233        assert!(!dir.join("semantic.bin").exists());
5234    }
5235
5236    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
5237        crate::symbols::Symbol {
5238            name: name.to_string(),
5239            kind,
5240            range: crate::symbols::Range {
5241                start_line: start,
5242                start_col: 0,
5243                end_line: end,
5244                end_col: 0,
5245            },
5246            signature: None,
5247            scope_chain: Vec::new(),
5248            exported: false,
5249            parent: None,
5250        }
5251    }
5252
5253    /// Heading symbols (Markdown / HTML headings) must NOT be indexed —
5254    /// they overwhelmingly dominated semantic results even on code-shaped
5255    /// queries because heading prose embeds far more strongly than code
5256    /// chunks. Skipping headings keeps aft_search a code-finder.
5257    #[test]
5258    fn symbols_to_chunks_skips_heading_symbols() {
5259        let project_root = PathBuf::from("/proj");
5260        let file = project_root.join("README.md");
5261        let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
5262
5263        let symbols = vec![
5264            make_symbol(SymbolKind::Heading, "Title", 0, 2),
5265            make_symbol(SymbolKind::Heading, "Section", 4, 6),
5266        ];
5267
5268        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
5269        assert!(
5270            chunks.is_empty(),
5271            "Heading symbols must be filtered out before embedding; got {} chunk(s)",
5272            chunks.len()
5273        );
5274    }
5275
5276    /// A symbol with an enormous signature (e.g. a YAML/Kubernetes CronJob
5277    /// whose inline `command:` script is parsed into the signature) must not
5278    /// produce an embed_text that overflows the embedding backend's physical
5279    /// batch. Before the clamp, the unbounded `signature:` append created a
5280    /// multi-KB input that aborted the whole index build and degraded every
5281    /// search to lexical-only.
5282    #[test]
5283    fn build_embed_text_clamps_oversized_signature() {
5284        let project_root = PathBuf::from("/proj");
5285        let file = project_root.join("cronjob.yaml");
5286        let huge_sig = "kubectl ".repeat(2000); // ~16 KB
5287        let source = "apiVersion: batch/v1\nkind: CronJob\n";
5288
5289        let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
5290        symbol.signature = Some(huge_sig);
5291
5292        let text = build_embed_text(&symbol, source, &file, &project_root);
5293        assert!(
5294            text.chars().count() <= MAX_EMBED_TEXT_CHARS,
5295            "embed_text must be clamped to {} chars, got {}",
5296            MAX_EMBED_TEXT_CHARS,
5297            text.chars().count()
5298        );
5299    }
5300
5301    /// Code symbols (functions, classes, methods, structs, etc.) must still
5302    /// be indexed alongside the heading skip — otherwise we'd starve the
5303    /// index entirely.
5304    #[test]
5305    fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
5306        let project_root = PathBuf::from("/proj");
5307        let file = project_root.join("src/lib.rs");
5308        let source = "pub fn handle_request() -> bool {\n    true\n}\n";
5309
5310        let symbols = vec![
5311            // A heading mixed in (e.g. from a doc comment block elsewhere).
5312            make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
5313            make_symbol(SymbolKind::Function, "handle_request", 0, 2),
5314            make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
5315        ];
5316
5317        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
5318        assert_eq!(
5319            chunks.len(),
5320            3,
5321            "Expected file-summary + 2 code chunks (Function + Struct), got {}",
5322            chunks.len()
5323        );
5324        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
5325        assert!(chunks
5326            .iter()
5327            .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
5328        assert!(names.contains(&"handle_request"));
5329        assert!(names.contains(&"AuthService"));
5330        assert!(
5331            !names.contains(&"doc heading"),
5332            "Heading symbol leaked into chunks: {names:?}"
5333        );
5334    }
5335
5336    #[test]
5337    fn validate_ssrf_allows_loopback_hostnames() {
5338        // Loopback hostnames are explicitly allowed so self-hosted backends
5339        // (Ollama at http://localhost:11434) work at their default config.
5340        for host in &[
5341            "http://localhost",
5342            "http://localhost:8080",
5343            "http://localhost:11434", // Ollama default
5344            "http://localhost.localdomain",
5345            "http://foo.localhost",
5346        ] {
5347            assert!(
5348                validate_base_url_no_ssrf(host).is_ok(),
5349                "Expected {host} to be allowed (loopback), got: {:?}",
5350                validate_base_url_no_ssrf(host)
5351            );
5352        }
5353    }
5354
5355    #[test]
5356    fn validate_ssrf_allows_loopback_ips() {
5357        // 127.0.0.0/8 is loopback — by definition same-machine and not an
5358        // SSRF target. Allow it so Ollama at http://127.0.0.1:11434 works.
5359        for url in &[
5360            "http://127.0.0.1",
5361            "http://127.0.0.1:11434", // Ollama default
5362            "http://127.0.0.1:8080",
5363            "http://127.1.2.3",
5364        ] {
5365            let result = validate_base_url_no_ssrf(url);
5366            assert!(
5367                result.is_ok(),
5368                "Expected {url} to be allowed (loopback), got: {:?}",
5369                result
5370            );
5371        }
5372    }
5373
5374    #[test]
5375    fn validate_ssrf_rejects_private_non_loopback_ips() {
5376        // Non-loopback private/reserved IPs remain rejected — homelab/intranet
5377        // services on LAN IPs are real SSRF targets even though the user
5378        // configured them. Users who want this can opt in by binding the
5379        // service to a public-routable address.
5380        for url in &[
5381            "http://192.168.1.1",
5382            "http://10.0.0.1",
5383            "http://172.16.0.1",
5384            "http://169.254.169.254",
5385            "http://100.64.0.1",
5386        ] {
5387            let result = validate_base_url_no_ssrf(url);
5388            assert!(
5389                result.is_err(),
5390                "Expected {url} to be rejected (non-loopback private), got: {:?}",
5391                result
5392            );
5393        }
5394    }
5395
5396    #[test]
5397    fn validate_ssrf_rejects_mdns_local_hostnames() {
5398        // mDNS .local hostnames typically resolve to LAN devices, not
5399        // loopback. Rejecting them before DNS lookup gives a clearer error.
5400        for host in &[
5401            "http://printer.local",
5402            "http://nas.local:8080",
5403            "http://homelab.local",
5404        ] {
5405            let result = validate_base_url_no_ssrf(host);
5406            assert!(
5407                result.is_err(),
5408                "Expected {host} to be rejected (mDNS), got: {:?}",
5409                result
5410            );
5411        }
5412    }
5413
5414    #[test]
5415    fn normalize_base_url_allows_localhost_for_tests() {
5416        // normalize_base_url itself should NOT block localhost — only
5417        // validate_base_url_no_ssrf does. Tests construct backends directly.
5418        assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
5419        assert!(normalize_base_url("http://localhost:8080").is_ok());
5420    }
5421
5422    #[test]
5423    fn ssrf_guard_blocks_reserved_ranges_but_allows_loopback() {
5424        use std::net::IpAddr;
5425        let blocked = |s: &str| is_private_non_loopback_ip(&s.parse::<IpAddr>().unwrap());
5426
5427        // Private / link-local / CGNAT — blocked (unchanged behavior).
5428        assert!(blocked("10.0.0.1"));
5429        assert!(blocked("192.168.1.1"));
5430        assert!(blocked("169.254.0.1"));
5431        assert!(blocked("100.64.0.1"));
5432        // Newly covered by delegating to url_fetch's complete list:
5433        assert!(
5434            blocked("198.18.0.1"),
5435            "RFC2544 benchmark range must be blocked"
5436        );
5437        assert!(blocked("224.0.0.1"), "multicast must be blocked");
5438        assert!(blocked("fc00::1"), "IPv6 ULA must be blocked");
5439        assert!(blocked("fe80::1"), "IPv6 link-local must be blocked");
5440
5441        // Loopback — allowed (local Ollama endpoint), incl. IPv4-mapped form.
5442        assert!(!blocked("127.0.0.1"), "loopback must stay allowed");
5443        assert!(!blocked("::1"), "IPv6 loopback must stay allowed");
5444        assert!(
5445            !blocked("::ffff:127.0.0.1"),
5446            "IPv4-mapped loopback must stay allowed (matches prior carve-out)"
5447        );
5448
5449        // A public address must NOT be flagged.
5450        assert!(!blocked("8.8.8.8"));
5451    }
5452
5453    /// Pin the user-facing wording of the ONNX version-mismatch error.
5454    /// The auto-fix path MUST be listed first because it's the only safe
5455    /// option that doesn't require sudo or risk breaking other apps that
5456    /// link the system library. Regression of any of these strings would
5457    /// either mislead users (system rm before auto-fix) or break the
5458    /// `aft doctor --fix` discovery path.
5459    #[test]
5460    fn ort_mismatch_message_recommends_auto_fix_first() {
5461        let msg =
5462            format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
5463
5464        // The reported version and path must appear verbatim.
5465        assert!(
5466            msg.contains("v1.9.0"),
5467            "should report detected version: {msg}"
5468        );
5469        assert!(
5470            msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
5471            "should report system path: {msg}"
5472        );
5473        assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
5474
5475        // Solution ordering: auto-fix is #1, system rm is #2, install is #3.
5476        let auto_fix_pos = msg
5477            .find("Auto-fix")
5478            .expect("Auto-fix solution missing — users won't discover --fix");
5479        let remove_pos = msg
5480            .find("Remove the old library")
5481            .expect("system-rm solution missing");
5482        assert!(
5483            auto_fix_pos < remove_pos,
5484            "Auto-fix must come before manual rm — see PR comment thread"
5485        );
5486
5487        // The auto-fix command must be runnable as-is on a fresh system.
5488        assert!(
5489            msg.contains("npx @cortexkit/aft doctor --fix"),
5490            "auto-fix command must be present and copy-pasteable: {msg}"
5491        );
5492    }
5493
5494    #[cfg(any(target_os = "linux", target_os = "macos"))]
5495    #[test]
5496    fn loaded_ort_version_detection_prefers_actual_loaded_library_path() {
5497        let requested = "libonnxruntime.so";
5498        let actual = "/usr/local/lib/libonnxruntime.so.1.19.0";
5499
5500        assert_eq!(detect_ort_version_from_path(requested), None);
5501        let (version, source) =
5502            detect_ort_version_from_resolved_or_requested(Some(actual.to_string()), requested);
5503
5504        assert_eq!(version, Some("1.19.0".to_string()));
5505        assert_eq!(source, actual);
5506
5507        let msg = format_ort_version_mismatch(&version.unwrap(), &source);
5508        assert!(msg.contains("v1.19.0"));
5509        assert!(msg.contains(actual));
5510    }
5511
5512    /// macOS dylib paths must not produce a malformed message when the
5513    /// system path lacks a trailing slash. This is a regression guard
5514    /// for the "{}\n{}" format string contract.
5515    #[test]
5516    fn ort_mismatch_message_handles_macos_dylib_path() {
5517        let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
5518        assert!(msg.contains("v1.9.0"));
5519        assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
5520        // The dylib path must appear in the auto-fix paragraph (single
5521        // quotes around it) AND in the manual-rm paragraph; verify
5522        // both placements survived the format string.
5523        assert!(
5524            msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
5525            "system path should be quoted in the auto-fix sentence: {msg}"
5526        );
5527    }
5528}