Skip to main content

aft/
semantic_index.rs

1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::io::{self, BufReader, BufWriter, Cursor, Read, Write};
18use std::path::{Path, PathBuf};
19use std::sync::Mutex;
20use std::time::Duration;
21use std::time::SystemTime;
22use tree_sitter::Parser;
23use url::Url;
24
// NOTE(review): presumably the vector width of the default local model — no
// use is visible in this part of the file; confirm against callers.
const DEFAULT_DIMENSION: usize = 384;
// NOTE(review): looks like an upper bound on persisted index entries — confirm
// against the (de)serialization code.
const MAX_ENTRIES: usize = 1_000_000;
// Covers high-dimensional backends such as OpenAI text-embedding-3-large (3072)
// and common local models (4096) while keeping a bounded supported shape.
const MAX_DIMENSION: usize = 4096;
// Size in bytes of one `f32` component.
const F32_BYTES: usize = std::mem::size_of::<f32>();
// NOTE(review): presumably the on-disk header sizes of the V1 and V2+ index
// formats — confirm against the reader/writer.
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
// User-facing remediation text for a missing ONNX Runtime installation.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
/// V3 adds subsec_nanos to the file-mtime table so staleness detection survives
/// restart round-trips on filesystems with subsecond mtime precision (APFS,
/// ext4 with nsec, NTFS). V1/V2 persisted whole-second mtimes only, which
/// caused every restart to flag ~99% of files as stale and re-embed them.
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
/// V4 keeps the V3 on-disk layout but rebuilds persisted snippets once after
/// fixing symbol ranges that were incorrectly treated as 1-based.
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
/// V5 adds file sizes to the file metadata table so incremental staleness
/// detection can catch content changes even when mtime precision misses them.
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
/// V6 stores paths relative to project_root and adds content hashes.
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
// Path appended to the (versioned) base URL of an OpenAI-compatible backend.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
// Path appended to the base URL of an Ollama backend.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
// Build/refresh embedding requests keep a larger budget because they run on
// background workers and often batch many texts through a cold local backend.
// Despite the name, this is the fallback for every backend whenever the
// configured `timeout_ms` is 0.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
// Interactive query embedding runs inside semantic_search dispatch; keep it
// short so slow/unreachable remote backends degrade to lexical quickly.
const DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS: u64 = 8_000;
// Batch size used when the configured `max_batch_size` is 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
// Maximum number of query vectors kept in the per-model query cache.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
// Stored as the fingerprint `base_url` when no usable base URL is configured.
const FALLBACK_BACKEND: &str = "none";
// Total tries per embedding HTTP request (the first attempt plus retries).
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
// Delay slept after failed attempt `i` (0-based) before attempt `i + 1`.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
// Held while acquiring the on-disk semantic cache lock so that acquisition
// attempts from threads of this process run one at a time.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
67
68pub struct SemanticIndexLock {
69    _guard: fs_lock::LockGuard,
70}
71
72impl SemanticIndexLock {
73    pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
74        let dir = storage_dir.join("semantic").join(project_key);
75        fs::create_dir_all(&dir)?;
76        let path = dir.join("cache.lock");
77        let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
78            .lock()
79            .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
80        fs_lock::try_acquire(&path, Duration::from_secs(2))
81            .map(|guard| Self { _guard: guard })
82            .map_err(|error| match error {
83                fs_lock::AcquireError::Timeout => {
84                    std::io::Error::other("timed out acquiring semantic cache lock")
85                }
86                fs_lock::AcquireError::Io(error) => error,
87            })
88    }
89}
90
/// Identity of the embedding setup a semantic index was built with.
///
/// The JSON encoding of this value is what gets compared against a stored
/// fingerprint, so a change to any field makes a previous fingerprint stop
/// matching.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier string (`config.backend.as_str()`).
    pub backend: String,
    /// Embedding model name as configured.
    pub model: String,
    /// Normalized backend base URL, or the fallback marker when no usable URL
    /// is configured. Deserializes to an empty string when the field is absent.
    #[serde(default)]
    pub base_url: String,
    /// Length of each embedding vector.
    pub dimension: usize,
    /// Chunking scheme version; takes `default_chunking_version()` when the
    /// field is absent from the serialized form.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
101
/// Chunking version stamped onto new fingerprints and assumed by serde for
/// serialized fingerprints that lack a `chunking_version` field.
fn default_chunking_version() -> u32 {
    2
}
105
106impl SemanticIndexFingerprint {
107    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
108        // Use normalized URL for fingerprinting so cosmetic differences
109        // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds.
110        let base_url = config
111            .base_url
112            .as_ref()
113            .and_then(|u| normalize_base_url(u).ok())
114            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
115        Self {
116            backend: config.backend.as_str().to_string(),
117            model: config.model.clone(),
118            base_url,
119            dimension,
120            chunking_version: default_chunking_version(),
121        }
122    }
123
124    pub fn as_string(&self) -> String {
125        serde_json::to_string(self).unwrap_or_else(|_| String::new())
126    }
127
128    fn matches_expected(&self, expected: &str) -> bool {
129        let encoded = self.as_string();
130        !encoded.is_empty() && encoded == expected
131    }
132}
133
/// Backend-specific state used to turn texts into embedding vectors.
enum SemanticEmbeddingEngine {
    /// Local ONNX embedder (all-MiniLM-L6-v2 via raw `ort`). The config-facing
    /// backend string stays "fastembed" for index-fingerprint compatibility.
    Local(LocalEmbedder),
    /// Remote backend speaking the OpenAI embeddings HTTP API. `base_url` is
    /// the normalized URL; `api_key`, when present, is sent as a bearer token.
    OpenAiCompatible {
        client: Client,
        model: String,
        base_url: String,
        api_key: Option<String>,
    },
    /// Remote backend speaking the Ollama embed HTTP API. `base_url` is the
    /// normalized URL.
    Ollama {
        client: Client,
        model: String,
        base_url: String,
    },
}
150
/// An embedding backend together with its effective settings and a small
/// cache of query embeddings.
pub struct SemanticEmbeddingModel {
    /// Backend kind copied from the configuration.
    backend: SemanticBackend,
    /// Model name copied from the configuration.
    model: String,
    /// Base URL exactly as configured (not normalized).
    base_url: Option<String>,
    /// Request timeout, in milliseconds, the HTTP client was built with.
    timeout_ms: u64,
    /// Effective batch size (the default replaces a configured 0).
    max_batch_size: usize,
    /// Vector length, cached once it has been learned; `None` until then.
    dimension: Option<usize>,
    /// Backend-specific embedding state.
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding vector.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Cached query texts in insertion order; the front is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    /// Number of cached-query lookups served from the cache.
    query_embedding_cache_hits: u64,
    /// Number of cached-query lookups that had to call the backend.
    query_embedding_cache_misses: u64,
}

/// Shorter alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
166
167fn validate_embedding_batch(
168    vectors: &[Vec<f32>],
169    expected_count: usize,
170    context: &str,
171) -> Result<(), String> {
172    if expected_count > 0 && vectors.is_empty() {
173        return Err(format!(
174            "{context} returned no vectors for {expected_count} inputs"
175        ));
176    }
177
178    if vectors.len() != expected_count {
179        return Err(format!(
180            "{context} returned {} vectors for {} inputs",
181            vectors.len(),
182            expected_count
183        ));
184    }
185
186    let Some(first_vector) = vectors.first() else {
187        return Ok(());
188    };
189    let expected_dimension = first_vector.len();
190    validate_embedding_dimension(expected_dimension)
191        .map_err(|error| format!("{context} returned {error}"))?;
192    for (index, vector) in vectors.iter().enumerate() {
193        if vector.len() != expected_dimension {
194            return Err(format!(
195                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
196                vector.len()
197            ));
198        }
199    }
200
201    Ok(())
202}
203
204fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
205    if dimension == 0 || dimension > MAX_DIMENSION {
206        return Err(format!(
207            "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
208        ));
209    }
210
211    Ok(())
212}
213
214/// Normalize a base URL: validate scheme and strip trailing slash.
215/// Does NOT perform SSRF/private-IP validation — call
216/// `validate_base_url_no_ssrf` separately when processing user-supplied config.
217fn normalize_base_url(raw: &str) -> Result<String, String> {
218    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
219    let scheme = parsed.scheme();
220    if scheme != "http" && scheme != "https" {
221        return Err(format!(
222            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
223            scheme
224        ));
225    }
226    Ok(parsed.to_string().trim_end_matches('/').to_string())
227}
228
229/// Validate that a base URL does not point to a private/loopback address.
230/// Call this on user-supplied config (at configure time) to prevent SSRF.
231/// Not called for programmatically constructed configs (e.g. tests).
232///
233/// **Loopback is allowed.** Self-hosted embedding backends (e.g. Ollama at
234/// `http://127.0.0.1:11434`) are a primary use case for `aft_search`. Loopback
235/// addresses by definition cannot be exploited as SSRF targets — they only
236/// reach services on the same machine. Allowing loopback unblocks Ollama at its
237/// default config without opening up SSRF to LAN/intranet services, which
238/// remain rejected.
239///
240/// **mDNS `.local` is rejected.** mDNS hostnames typically resolve to LAN
241/// devices (printers, homelab servers); rejecting them before DNS lookup keeps
242/// the SSRF guard meaningful for non-loopback private networks.
243pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
244    use std::net::{IpAddr, ToSocketAddrs};
245
246    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
247
248    let host = parsed.host_str().unwrap_or("");
249
250    // Loopback hostnames are explicitly allowed. RFC 6761 mandates that
251    // `localhost` and `*.localhost` resolve to loopback;
252    // `localhost.localdomain` is a historical alias used on some Linux
253    // distros. Self-hosted backends like Ollama use these by default.
254    let is_loopback_host =
255        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
256    if is_loopback_host {
257        return Ok(());
258    }
259
260    // mDNS hostnames are typically LAN devices, not loopback. Reject before
261    // DNS lookup so users get a clear error rather than a private-IP error.
262    if host.ends_with(".local") {
263        return Err(format!(
264            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
265        ));
266    }
267
268    // Resolve the hostname. Reject private/link-local/CGNAT IPs but NOT
269    // loopback (which is by definition same-machine and not an SSRF target).
270    let port = parsed.port_or_known_default().unwrap_or(443);
271    let addr_str = format!("{host}:{port}");
272    let addrs: Vec<IpAddr> = addr_str
273        .to_socket_addrs()
274        .map(|iter| iter.map(|sa| sa.ip()).collect())
275        .unwrap_or_default();
276    for ip in &addrs {
277        if is_private_non_loopback_ip(ip) {
278            return Err(format!(
279                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
280            ));
281        }
282    }
283
284    Ok(())
285}
286
287/// Returns true for IPv4/IPv6 addresses in private/link-local/CGNAT/benchmark/
288/// multicast/reserved ranges, EXCLUDING loopback (127.0.0.0/8 and ::1). Loopback
289/// is considered safe for SSRF purposes (same-machine, e.g. a local Ollama
290/// endpoint) — see [`validate_base_url_no_ssrf`] for rationale.
291///
292/// Delegates to [`crate::url_fetch::is_private_or_reserved_ip`] so there is one
293/// authoritative reserved-range list (the url_fetch copy is the maintained one;
294/// this used to be a drifting subset that missed e.g. 198.18.0.0/15 and the
295/// multicast/reserved blocks). We only re-add the loopback carve-out the
296/// url_fetch guard deliberately does not make.
297fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
298    // Canonicalize so an IPv4-mapped loopback (`::ffff:127.0.0.1`) is also
299    // recognized as loopback, matching the prior carve-out.
300    if ip.to_canonical().is_loopback() {
301        return false;
302    }
303    crate::url_fetch::is_private_or_reserved_ip(*ip)
304}
305
306fn build_openai_embeddings_endpoint(base_url: &str) -> String {
307    if base_url.ends_with("/v1") {
308        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
309    } else {
310        format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
311    }
312}
313
314fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
315    if base_url.ends_with("/api") {
316        format!("{base_url}/embed")
317    } else {
318        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
319    }
320}
321
/// Trims surrounding whitespace from an optional value and maps a missing,
/// empty, or whitespace-only value to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let raw = value?;
    match raw.trim() {
        "" => None,
        trimmed => Some(trimmed.to_string()),
    }
}
332
333fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
334    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
335}
336
337/// Local backends (LM Studio, Ollama, llama.cpp) can return a 4xx — usually
338/// 400/409 — while a model is loading or was just unloaded. Only narrowly known
339/// local-backend loading/unloaded payloads are classified transient; generic
340/// 4xx bodies that merely mention phrases like "loading model" remain
341/// permanent so misconfigurations do not retry forever.
342fn embedding_response_body_is_transient(status: reqwest::StatusCode, raw: &str) -> bool {
343    if !matches!(
344        status,
345        reqwest::StatusCode::BAD_REQUEST
346            | reqwest::StatusCode::CONFLICT
347            | reqwest::StatusCode::REQUEST_TIMEOUT
348            | reqwest::StatusCode::LOCKED
349            | reqwest::StatusCode::TOO_EARLY
350    ) {
351        return false;
352    }
353
354    let lower = raw.to_ascii_lowercase();
355    let normalized = lower.trim();
356
357    normalized.contains("model was unloaded while the request was still in queue")
358        || normalized == "model is loading"
359        || normalized.starts_with("model is loading,")
360        || normalized.contains(r#""error":"model is loading"#)
361        || normalized.contains(r#""message":"model is loading"#)
362        || normalized == "model not loaded"
363        || normalized.contains(r#""error":"model not loaded""#)
364        || normalized.contains(r#""message":"model not loaded""#)
365        || normalized == "loading model into memory"
366        || normalized.contains(r#""error":"loading model into memory""#)
367        || normalized.contains(r#""message":"loading model into memory""#)
368        || normalized == "model is being loaded"
369        || normalized.contains(r#""error":"model is being loaded""#)
370        || normalized.contains(r#""message":"model is being loaded""#)
371        || normalized == "model is currently loading"
372        || normalized.contains(r#""error":"model is currently loading""#)
373        || normalized.contains(r#""message":"model is currently loading""#)
374}
375
/// In-request retry predicate for send failures: only connection errors are
/// retried immediately. Timeouts are deliberately excluded here even though
/// `embedding_send_error_is_transient` treats them as transient.
fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
    error.is_connect()
}
379
380/// Whether a send-time error means the backend is *unreachable or temporarily
381/// failing* (vs. a real misconfiguration). Broader than the in-request retry
382/// predicate: a per-request timeout is transient for the build/refresh layer
383/// (the model may still be cold-loading) but we don't burn the 3 fast
384/// in-request attempts on it — the build-level retry rides it out instead.
385fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
386    error.is_connect() || error.is_timeout()
387}
388
389fn embedding_response_read_error_is_transient(error: &reqwest::Error) -> bool {
390    embedding_send_error_is_transient(error) || error.is_body() || error.is_decode()
391}
392
/// Machine-readable tag placed in front of embedding error strings whose root
/// cause is transient — the backend is down, timing out, or returning
/// 5xx/429, not misconfigured.
///
/// The build and corpus-refresh layers decide retry-vs-give-up from this tag
/// (see [`embedding_failure_is_transient`]) rather than re-parsing error text,
/// which keeps the transience verdict at the one site that knows it. Remove it
/// with [`strip_transient_embedding_marker`] before showing a message to a
/// user.
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";

/// Reports whether `error` carries the transient tag anywhere in its text,
/// meaning a retry once the backend recovers is appropriate.
pub fn embedding_failure_is_transient(error: &str) -> bool {
    let marker = TRANSIENT_EMBEDDING_MARKER;
    error.contains(marker)
}

/// Returns `error` with every occurrence of the transient tag removed, ready
/// for display.
pub fn strip_transient_embedding_marker(error: &str) -> String {
    // Splitting on the tag and gluing the pieces back together drops each
    // occurrence.
    error.split(TRANSIENT_EMBEDDING_MARKER).collect()
}
411
412fn sleep_before_embedding_retry(attempt_index: usize) {
413    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
414        std::thread::sleep(Duration::from_millis(*delay_ms));
415    }
416}
417
418fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
419where
420    F: FnMut() -> reqwest::blocking::RequestBuilder,
421{
422    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
423        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
424
425        let response = match make_request().send() {
426            Ok(response) => response,
427            Err(error) => {
428                if !last_attempt && is_retryable_embedding_error(&error) {
429                    sleep_before_embedding_retry(attempt_index);
430                    continue;
431                }
432                // Connect/timeout failures mean the backend is unreachable or
433                // cold-loading — mark transient so the build layer rides it out
434                // and self-heals instead of parking the index in `Failed`.
435                let marker = if embedding_send_error_is_transient(&error) {
436                    TRANSIENT_EMBEDDING_MARKER
437                } else {
438                    ""
439                };
440                return Err(format!("{marker}{backend_label} request failed: {error}"));
441            }
442        };
443
444        let status = response.status();
445        let raw = match response.text() {
446            Ok(raw) => raw,
447            Err(error) => {
448                if !last_attempt && embedding_response_read_error_is_transient(&error) {
449                    sleep_before_embedding_retry(attempt_index);
450                    continue;
451                }
452                let marker = if embedding_response_read_error_is_transient(&error) {
453                    TRANSIENT_EMBEDDING_MARKER
454                } else {
455                    ""
456                };
457                return Err(format!(
458                    "{marker}{backend_label} response read failed: {error}"
459                ));
460            }
461        };
462
463        if status.is_success() {
464            return Ok(raw);
465        }
466
467        // A 4xx whose body says the model is loading/unloaded is transient on
468        // local backends (LM Studio/Ollama), so treat it like a retryable
469        // status: ride it out at both the in-request and build-retry layers.
470        let body_transient = embedding_response_body_is_transient(status, &raw);
471        if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
472            sleep_before_embedding_retry(attempt_index);
473            continue;
474        }
475
476        // 5xx / 429 are server-side and transient — the backend is overloaded
477        // or briefly unavailable, not misconfigured. A 4xx whose body indicates
478        // the model is (un)loading is also transient (local backend mid-swap).
479        // Other 4xx (auth, bad request, model-not-found) is a real error the
480        // user must fix; no marker.
481        let marker = if is_retryable_embedding_status(status) || body_transient {
482            TRANSIENT_EMBEDDING_MARKER
483        } else {
484            ""
485        };
486        return Err(format!(
487            "{marker}{backend_label} request failed (HTTP {}): {}",
488            status, raw
489        ));
490    }
491
492    unreachable!("embedding request retries exhausted without returning")
493}
494
495fn configured_embedding_timeout_ms(config: &SemanticBackendConfig) -> u64 {
496    if config.timeout_ms == 0 {
497        DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
498    } else {
499        config.timeout_ms
500    }
501}
502
503impl SemanticEmbeddingModel {
504    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
505        Self::from_config_with_timeout_ms(config, configured_embedding_timeout_ms(config))
506    }
507
508    pub fn from_config_for_query(config: &SemanticBackendConfig) -> Result<Self, String> {
509        let timeout_ms =
510            configured_embedding_timeout_ms(config).min(DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS);
511        Self::from_config_with_timeout_ms(config, timeout_ms)
512    }
513
514    fn from_config_with_timeout_ms(
515        config: &SemanticBackendConfig,
516        timeout_ms: u64,
517    ) -> Result<Self, String> {
518        let max_batch_size = if config.max_batch_size == 0 {
519            DEFAULT_MAX_BATCH_SIZE
520        } else {
521            config.max_batch_size
522        };
523
524        let api_key_env = normalize_api_key(config.api_key_env.clone());
525        let model = config.model.clone();
526
527        let client = Client::builder()
528            .timeout(Duration::from_millis(timeout_ms))
529            .redirect(reqwest::redirect::Policy::none())
530            .build()
531            .map_err(|error| format!("failed to configure embedding client: {error}"))?;
532
533        let engine = match config.backend {
534            SemanticBackend::Fastembed => {
535                SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
536            }
537            SemanticBackend::OpenAiCompatible => {
538                let raw = config.base_url.as_ref().ok_or_else(|| {
539                    "base_url is required for openai_compatible backend".to_string()
540                })?;
541                let base_url = normalize_base_url(raw)?;
542
543                let api_key = match api_key_env {
544                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
545                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
546                    })?),
547                    None => None,
548                };
549
550                SemanticEmbeddingEngine::OpenAiCompatible {
551                    client,
552                    model,
553                    base_url,
554                    api_key,
555                }
556            }
557            SemanticBackend::Ollama => {
558                let raw = config
559                    .base_url
560                    .as_ref()
561                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
562                let base_url = normalize_base_url(raw)?;
563
564                SemanticEmbeddingEngine::Ollama {
565                    client,
566                    model,
567                    base_url,
568                }
569            }
570        };
571
572        Ok(Self {
573            backend: config.backend,
574            model: config.model.clone(),
575            base_url: config.base_url.clone(),
576            timeout_ms,
577            max_batch_size,
578            dimension: None,
579            engine,
580            query_embedding_cache: HashMap::new(),
581            query_embedding_cache_order: VecDeque::new(),
582            query_embedding_cache_hits: 0,
583            query_embedding_cache_misses: 0,
584        })
585    }
586
    /// Backend kind this model was configured with.
    pub fn backend(&self) -> SemanticBackend {
        self.backend
    }
590
    /// Configured embedding model name.
    pub fn model(&self) -> &str {
        &self.model
    }
594
    /// Base URL exactly as configured (not normalized), if any.
    pub fn base_url(&self) -> Option<&str> {
        self.base_url.as_deref()
    }
598
    /// Effective batch size (the default when the configuration left it at 0).
    pub fn max_batch_size(&self) -> usize {
        self.max_batch_size
    }
602
    /// Request timeout, in milliseconds, this model was built with.
    pub fn timeout_ms(&self) -> u64 {
        self.timeout_ms
    }
606
607    pub fn fingerprint(
608        &mut self,
609        config: &SemanticBackendConfig,
610    ) -> Result<SemanticIndexFingerprint, String> {
611        let dimension = self.dimension()?;
612        Ok(SemanticIndexFingerprint::from_config(config, dimension))
613    }
614
615    pub fn dimension(&mut self) -> Result<usize, String> {
616        if let Some(dimension) = self.dimension {
617            return Ok(dimension);
618        }
619
620        let dimension = match &mut self.engine {
621            SemanticEmbeddingEngine::Local(model) => {
622                let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
623                vectors
624                    .first()
625                    .map(|v| v.len())
626                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
627            }
628            SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
629                let vectors =
630                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
631                vectors
632                    .first()
633                    .map(|v| v.len())
634                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
635            }
636            SemanticEmbeddingEngine::Ollama { .. } => {
637                let vectors =
638                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
639                vectors
640                    .first()
641                    .map(|v| v.len())
642                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
643            }
644        };
645
646        self.dimension = Some(dimension);
647        Ok(dimension)
648    }
649
    /// Embeds `texts` with the configured backend; thin public wrapper around
    /// `embed_texts`.
    ///
    /// # Errors
    ///
    /// Returns whatever error `embed_texts` reports.
    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        self.embed_texts(texts)
    }
653
654    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
655        if let Some(vector) = self.query_embedding_cache.get(query) {
656            self.query_embedding_cache_hits += 1;
657            return Ok(vector.clone());
658        }
659
660        self.query_embedding_cache_misses += 1;
661        let embeddings = self.embed_texts(vec![query.to_string()])?;
662        let vector = embeddings
663            .first()
664            .cloned()
665            .ok_or_else(|| "embedding model returned no query vector".to_string())?;
666
667        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
668            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
669                self.query_embedding_cache.remove(&oldest);
670            }
671        }
672        self.query_embedding_cache
673            .insert(query.to_string(), vector.clone());
674        self.query_embedding_cache_order
675            .push_back(query.to_string());
676
677        Ok(vector)
678    }
679
680    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
681        (
682            self.query_embedding_cache_hits,
683            self.query_embedding_cache_misses,
684            self.query_embedding_cache.len(),
685        )
686    }
687
688    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
689        match &mut self.engine {
690            SemanticEmbeddingEngine::Local(model) => model
691                .embed(&texts)
692                .map_err(|error| format!("failed to embed batch: {error}")),
693            SemanticEmbeddingEngine::OpenAiCompatible {
694                client,
695                model,
696                base_url,
697                api_key,
698            } => {
699                let expected_text_count = texts.len();
700                let endpoint = build_openai_embeddings_endpoint(base_url);
701                let body = serde_json::json!({
702                    "input": texts,
703                    "model": model,
704                });
705
706                let raw = send_embedding_request(
707                    || {
708                        // `.json(&body)` sets Content-Type: application/json
709                        // automatically. Do NOT add `.header("Content-Type",
710                        // "application/json")` afterwards — RequestBuilder::header()
711                        // calls HeaderMap::append, which produces TWO Content-Type
712                        // headers on the wire. OpenAI's /v1/embeddings endpoint
713                        // treats duplicate Content-Type as malformed and rejects
714                        // the body with 400 "you must provide a model parameter"
715                        // even when `model` is set. Verified end-to-end against
716                        // api.openai.com. See issue #36.
717                        let mut request = client.post(&endpoint).json(&body);
718
719                        if let Some(api_key) = api_key {
720                            request = request.header("Authorization", format!("Bearer {api_key}"));
721                        }
722
723                        request
724                    },
725                    "openai compatible",
726                )?;
727
728                #[derive(Deserialize)]
729                struct OpenAiResponse {
730                    data: Vec<OpenAiEmbeddingResult>,
731                }
732
733                #[derive(Deserialize)]
734                struct OpenAiEmbeddingResult {
735                    embedding: Vec<f32>,
736                    index: Option<u32>,
737                }
738
739                let parsed: OpenAiResponse = serde_json::from_str(&raw)
740                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
741                if parsed.data.len() != expected_text_count {
742                    return Err(format!(
743                        "openai compatible response returned {} embeddings for {} inputs",
744                        parsed.data.len(),
745                        expected_text_count
746                    ));
747                }
748
749                let mut vectors = vec![Vec::new(); parsed.data.len()];
750                for (i, item) in parsed.data.into_iter().enumerate() {
751                    let index = item.index.unwrap_or(i as u32) as usize;
752                    if index >= vectors.len() {
753                        return Err(
754                            "openai compatible response contains invalid vector index".to_string()
755                        );
756                    }
757                    vectors[index] = item.embedding;
758                }
759
760                for vector in &vectors {
761                    if vector.is_empty() {
762                        return Err(
763                            "openai compatible response contained missing vectors".to_string()
764                        );
765                    }
766                }
767
768                self.dimension = vectors.first().map(Vec::len);
769                Ok(vectors)
770            }
771            SemanticEmbeddingEngine::Ollama {
772                client,
773                model,
774                base_url,
775            } => {
776                let expected_text_count = texts.len();
777                let endpoint = build_ollama_embeddings_endpoint(base_url);
778
779                #[derive(Serialize)]
780                struct OllamaPayload<'a> {
781                    model: &'a str,
782                    input: Vec<String>,
783                }
784
785                let payload = OllamaPayload {
786                    model,
787                    input: texts,
788                };
789
790                let raw = send_embedding_request(
791                    || {
792                        // `.json(&payload)` sets Content-Type automatically.
793                        // Same duplicate-header trap as the OpenAI branch above
794                        // — most Ollama servers tolerate it, but the
795                        // single-Content-Type form is the correct one.
796                        client.post(&endpoint).json(&payload)
797                    },
798                    "ollama",
799                )?;
800
801                #[derive(Deserialize)]
802                struct OllamaResponse {
803                    embeddings: Vec<Vec<f32>>,
804                }
805
806                let parsed: OllamaResponse = serde_json::from_str(&raw)
807                    .map_err(|error| format!("invalid ollama response: {error}"))?;
808                if parsed.embeddings.is_empty() {
809                    return Err("ollama response returned no embeddings".to_string());
810                }
811                if parsed.embeddings.len() != expected_text_count {
812                    return Err(format!(
813                        "ollama response returned {} embeddings for {} inputs",
814                        parsed.embeddings.len(),
815                        expected_text_count
816                    ));
817                }
818
819                let vectors = parsed.embeddings;
820                for vector in &vectors {
821                    if vector.is_empty() {
822                        return Err("ollama response contained empty embeddings".to_string());
823                    }
824                }
825
826                self.dimension = vectors.first().map(Vec::len);
827                Ok(vectors)
828            }
829        }
830    }
831}
832
833/// Pre-validate ONNX Runtime by attempting a raw dlopen before ort touches it.
834/// This catches broken/incompatible .so files without risking a panic in the ort crate.
835/// Also checks the runtime version via OrtGetApiBase if available.
836pub fn pre_validate_onnx_runtime() -> Result<(), String> {
837    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();
838
839    #[cfg(any(target_os = "linux", target_os = "macos"))]
840    {
841        #[cfg(target_os = "linux")]
842        let default_name = "libonnxruntime.so";
843        #[cfg(target_os = "macos")]
844        let default_name = "libonnxruntime.dylib";
845
846        let lib_name = dylib_path.as_deref().unwrap_or(default_name);
847
848        unsafe {
849            let c_name = std::ffi::CString::new(lib_name)
850                .map_err(|e| format!("invalid library path: {}", e))?;
851            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
852            if handle.is_null() {
853                let err = libc::dlerror();
854                let msg = if err.is_null() {
855                    "unknown dlopen error".to_string()
856                } else {
857                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
858                };
859                return Err(format!(
860                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
861                     Run `npx @cortexkit/aft doctor` to diagnose.",
862                    lib_name, msg
863                ));
864            }
865
866            // Try to detect the runtime version from the actual loaded library
867            // path first. A bare dlopen("libonnxruntime.so") may resolve to an
868            // older system ORT through loader search paths; checking only the
869            // caller-supplied soname would miss that and let ort fail opaquely.
870            let (detected_version, version_source) =
871                detect_ort_version_from_loaded_library(handle, lib_name);
872
873            libc::dlclose(handle);
874
875            // Check version compatibility — we need 1.20+.
876            if let Some(ref version) = detected_version {
877                let parts: Vec<&str> = version.split('.').collect();
878                if let (Some(major), Some(minor)) = (
879                    parts.first().and_then(|s| s.parse::<u32>().ok()),
880                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
881                ) {
882                    if major != 1 || minor < 20 {
883                        return Err(format_ort_version_mismatch(version, &version_source));
884                    }
885                }
886            }
887        }
888    }
889
890    #[cfg(target_os = "windows")]
891    {
892        // Validate ONNX Runtime availability on Windows by loading the DLL
893        // via LoadLibraryExW before the ort crate attempts its own LoadLibrary.
894        // This way we can produce a friendly error (with installation hints)
895        // instead of a raw LoadLibrary failure from deep inside fastembed.
896        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");
897
898        // Use kernel32 LoadLibraryExW for the validation — built-in, no
899        // crate dependency required. GetModuleFileNameW resolves the loaded
900        // DLL path for version probing via the version.dll API.
901        #[link(name = "kernel32")]
902        extern "system" {
903            fn LoadLibraryExW(
904                lpLibFileName: *const u16,
905                hFile: *mut std::ffi::c_void,
906                dwFlags: u32,
907            ) -> *mut std::ffi::c_void;
908            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
909            fn GetModuleFileNameW(
910                hModule: *mut std::ffi::c_void,
911                lpFilename: *mut u16,
912                nSize: u32,
913            ) -> u32;
914        }
915
916        #[link(name = "version")]
917        extern "system" {
918            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
919            fn GetFileVersionInfoW(
920                lptstrFilename: *const u16,
921                dwHandle: u32,
922                dwLen: u32,
923                lpData: *mut std::ffi::c_void,
924            ) -> i32;
925            fn VerQueryValueW(
926                pBlock: *mut std::ffi::c_void,
927                lpSubBlock: *const u16,
928                lplpBuffer: *mut *mut std::ffi::c_void,
929                puLen: *mut u32,
930            ) -> i32;
931        }
932
933        #[repr(C)]
934        struct VS_FIXEDFILEINFO {
935            dw_signature: u32,
936            dw_struc_version: u32,
937            dw_file_version_ms: u32, // HIWORD major, LOWORD minor
938            dw_file_version_ls: u32, // HIWORD build, LOWORD revision
939            dw_product_version_ms: u32,
940            dw_product_version_ls: u32,
941            dw_file_flags_mask: u32,
942            dw_file_flags: u32,
943            dw_file_os: u32,
944            dw_file_type: u32,
945            dw_file_subtype: u32,
946            dw_file_date_ms: u32,
947            dw_file_date_ls: u32,
948        }
949
950        unsafe {
951            use std::os::windows::ffi::OsStrExt;
952            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
953                .encode_wide()
954                .chain(std::iter::once(0))
955                .collect();
956
957            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
958            if handle.is_null() {
959                let err = std::io::Error::last_os_error();
960                return Err(format!(
961                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
962                     Run `npx @cortexkit/aft doctor` to diagnose.",
963                    lib_name, err
964                ));
965            }
966
967            // Probe the file version from PE resources so we can reject
968            // outdated DLLs (e.g. v1.9.x) before the ort crate panics.
969            let mut detected_major: u32 = 0;
970            let mut detected_minor: u32 = 0;
971            // Use MAX_UNICODEPATH (32767) so deeply nested ORT paths (e.g.
972            // long NuGet package paths under %USERPROFILE%) never truncate.
973            // GetModuleFileNameW truncates silently when the buffer is too
974            // small, which causes version probing to fail and the version
975            // check to be bypassed — better to allocate generously.
976            let mut path_buf = [0u16; 32767];
977            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
978            if path_len > 0 {
979                let mut dummy_handle: u32 = 0;
980                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
981                if info_size > 0 {
982                    let mut info = vec![0u8; info_size as usize];
983                    if GetFileVersionInfoW(
984                        path_buf.as_ptr(),
985                        0,
986                        info_size,
987                        info.as_mut_ptr() as *mut std::ffi::c_void,
988                    ) != 0
989                    {
990                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
991                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
992                        let mut vs_len: u32 = 0;
993                        if VerQueryValueW(
994                            info.as_mut_ptr() as *mut std::ffi::c_void,
995                            sub_block.as_ptr(),
996                            &mut vs_info,
997                            &mut vs_len,
998                        ) != 0
999                            && !vs_info.is_null()
1000                        {
1001                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
1002                            detected_major = (*fixed).dw_file_version_ms >> 16;
1003                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
1004                        }
1005                    }
1006                }
1007            }
1008
1009            FreeLibrary(handle);
1010
1011            // Version compatibility check (mirrors the Linux/macOS path).
1012            // If version could not be detected (detected_major == 0) we let
1013            // the load succeed — the ort crate will diagnose further.
1014            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
1015                let ver = format!("{}.{}", detected_major, detected_minor);
1016                return Err(format_ort_version_mismatch(&ver, lib_name));
1017            }
1018        }
1019    }
1020
1021    Ok(())
1022}
1023
1024#[cfg(any(target_os = "linux", target_os = "macos"))]
1025unsafe fn loaded_library_path_from_handle(handle: *mut std::ffi::c_void) -> Option<String> {
1026    let symbol_name = std::ffi::CString::new("OrtGetApiBase").ok()?;
1027    let symbol = unsafe { libc::dlsym(handle, symbol_name.as_ptr()) };
1028    if symbol.is_null() {
1029        return None;
1030    }
1031
1032    let mut info = std::mem::MaybeUninit::<libc::Dl_info>::uninit();
1033    if unsafe { libc::dladdr(symbol, info.as_mut_ptr()) } == 0 {
1034        return None;
1035    }
1036
1037    let info = unsafe { info.assume_init() };
1038    if info.dli_fname.is_null() {
1039        return None;
1040    }
1041
1042    Some(
1043        unsafe { std::ffi::CStr::from_ptr(info.dli_fname) }
1044            .to_string_lossy()
1045            .into_owned(),
1046    )
1047}
1048
1049#[cfg(any(target_os = "linux", target_os = "macos"))]
1050fn detect_ort_version_from_resolved_or_requested(
1051    resolved_path: Option<String>,
1052    requested_lib_name: &str,
1053) -> (Option<String>, String) {
1054    if let Some(path) = resolved_path {
1055        if let Some(version) = detect_ort_version_from_path(&path) {
1056            return (Some(version), path);
1057        }
1058        return (detect_ort_version_from_path(requested_lib_name), path);
1059    }
1060
1061    (
1062        detect_ort_version_from_path(requested_lib_name),
1063        requested_lib_name.to_string(),
1064    )
1065}
1066
1067#[cfg(any(target_os = "linux", target_os = "macos"))]
1068fn detect_ort_version_from_loaded_library(
1069    handle: *mut std::ffi::c_void,
1070    requested_lib_name: &str,
1071) -> (Option<String>, String) {
1072    detect_ort_version_from_resolved_or_requested(
1073        unsafe { loaded_library_path_from_handle(handle) },
1074        requested_lib_name,
1075    )
1076}
1077
1078/// Try to extract the ORT version from the library filename or resolved symlink.
1079/// Examples: "libonnxruntime.so.1.19.0" → "1.19.0", "libonnxruntime.1.24.4.dylib" → "1.24.4"
1080#[cfg(any(target_os = "linux", target_os = "macos"))]
1081fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
1082    let path = std::path::Path::new(lib_path);
1083
1084    // Try the path as given, then follow symlinks
1085    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
1086        .into_iter()
1087        .flatten()
1088    {
1089        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1090            if let Some(version) = extract_version_from_filename(name) {
1091                return Some(version);
1092            }
1093        }
1094    }
1095
1096    // Also check for versioned siblings in the same directory
1097    if let Some(parent) = path.parent() {
1098        if let Ok(entries) = std::fs::read_dir(parent) {
1099            for entry in entries.flatten() {
1100                if let Some(name) = entry.file_name().to_str() {
1101                    if name.starts_with("libonnxruntime") {
1102                        if let Some(version) = extract_version_from_filename(name) {
1103                            return Some(version);
1104                        }
1105                    }
1106                }
1107            }
1108        }
1109    }
1110
1111    None
1112}
1113
/// Extract the first `X.Y.Z` version from a library file name.
///
/// Scans `name` left to right for three runs of ASCII digits separated by
/// single dots and returns the leftmost such span, e.g.
/// "libonnxruntime.so.1.19.0" → "1.19.0" and
/// "libonnxruntime.1.24.4.dylib" → "1.24.4". Returns `None` when the name
/// contains no three-component version.
///
/// This is a plain scanner rather than a regular expression so that nothing
/// has to be compiled per call (the function runs once per directory entry
/// during the sibling scan), and so that only ASCII digits are accepted —
/// the only digits the caller's `parse::<u32>()` can consume.
#[cfg(any(target_os = "linux", target_os = "macos"))]
fn extract_version_from_filename(name: &str) -> Option<String> {
    /// End (exclusive) of the ASCII digit run starting at `start`, if any.
    fn digit_run_end(bytes: &[u8], start: usize) -> Option<usize> {
        let len = bytes
            .get(start..)?
            .iter()
            .take_while(|byte| byte.is_ascii_digit())
            .count();
        (len > 0).then_some(start + len)
    }

    /// End of the digit run that follows a `.` located at index `dot`.
    fn dotted_run_end(bytes: &[u8], dot: usize) -> Option<usize> {
        if bytes.get(dot) == Some(&b'.') {
            digit_run_end(bytes, dot + 1)
        } else {
            None
        }
    }

    let bytes = name.as_bytes();
    let mut start = 0;
    while start < bytes.len() {
        let Some(major_end) = digit_run_end(bytes, start) else {
            start += 1;
            continue;
        };

        let patch_end = dotted_run_end(bytes, major_end)
            .and_then(|minor_end| dotted_run_end(bytes, minor_end));
        if let Some(end) = patch_end {
            // `start` and `end` both sit next to ASCII bytes, so slicing the
            // `str` here cannot split a multi-byte character.
            return Some(name[start..end].to_string());
        }

        // Starting anywhere inside this digit run would hit the same
        // mismatch, so resume right after it.
        start = major_end;
    }

    None
}
1121
/// Suggest a shell command that removes the library found at `lib_path`.
///
/// Paths under `/usr/local/lib` and the bare default library names are
/// treated as a system-wide install: on Linux and macOS the suggestion then
/// targets `/usr/local/lib/libonnxruntime*` with `sudo`. Every other case
/// gets a plain `rm` of the given path. The result carries a three-space
/// indent so it can be embedded in a numbered list.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = lib_path.starts_with("/usr/local/lib")
        || matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");

    if is_system_install {
        if cfg!(target_os = "linux") {
            return String::from("   sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig");
        }
        if cfg!(target_os = "macos") {
            return String::from("   sudo rm /usr/local/lib/libonnxruntime*");
        }
    }

    format!("   rm '{lib_path}'")
}
1134
1135/// Build the user-facing error message for an incompatible ONNX Runtime
1136/// install. Extracted as a pure helper so we can unit-test the wording
1137/// stability — the auto-fix recommendation must always come first because
1138/// it's the only safe option, and the system-rm step must remain present
1139/// because some users prefer the system-wide cleanup path.
1140pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1141    format!(
1142        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1143         Solutions:\n\
1144         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1145         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1146         configures the bridge to load it instead of the system library — no \
1147         changes to '{}'.\n\
1148         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1149         {}\n\
1150         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1151         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1152        version,
1153        lib_name,
1154        lib_name,
1155        suggest_removal_command(lib_name),
1156    )
1157}
1158
/// Decide whether an error message means ONNX Runtime could not be loaded.
///
/// A message qualifies when it starts (after leading whitespace) with the
/// exact prefix `ONNX Runtime not found.`, or when, compared
/// case-insensitively for ASCII, it both names ONNX Runtime and contains a
/// phrase typical of a dynamic-library load failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_MARKERS: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_MARKERS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    // Messages produced by our own pre-validation carry this exact prefix.
    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any =
        |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    mentions_any(&RUNTIME_MARKERS) && mentions_any(&LOAD_FAILURE_MARKERS)
}
1184
1185pub fn format_embedding_init_error(error: impl Display) -> String {
1186    let message = error.to_string();
1187
1188    if is_onnx_runtime_unavailable(&message) {
1189        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1190    }
1191
1192    format!("failed to initialize semantic embedding model: {message}")
1193}
1194
/// A chunk of code ready for embedding — derived from a Symbol with context enrichment.
///
/// `embed_text` is the string handed to the embedding backend; the other
/// fields are metadata carried alongside the resulting vector.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// Absolute file path
    pub file: PathBuf,
    /// Symbol name
    pub name: String,
    /// Symbol kind (function, class, struct, etc.)
    pub kind: SymbolKind,
    /// First line of the symbol (0-based internally, inclusive)
    pub start_line: u32,
    /// Last line of the symbol (0-based internally, inclusive)
    pub end_line: u32,
    /// Whether the symbol is exported
    pub exported: bool,
    /// The enriched text that gets embedded (scope + signature + body snippet).
    /// Refresh-time vector reuse hashes and compares this exact string.
    pub embed_text: String,
    /// Short code snippet for display in results
    pub snippet: String,
}
1214
/// A stored embedding entry — chunk metadata + vector
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    /// The chunk this entry was embedded from.
    chunk: SemanticChunk,
    /// Embedding produced for `chunk.embed_text`.
    vector: Vec<f32>,
}
1221
/// The semantic index — stores embeddings for all symbols in a project
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// One entry per embedded chunk.
    entries: Vec<EmbeddingEntry>,
    /// Track which files are indexed and their mtime for staleness detection
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Track indexed file sizes alongside mtimes for staleness detection
    file_sizes: HashMap<PathBuf, u64>,
    /// blake3 content hash recorded for each indexed file.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Embedding dimension (384 for MiniLM-L6-v2)
    dimension: usize,
    /// NOTE(review): presumably identifies the backend/model that produced
    /// the stored vectors — confirm against `SemanticIndexFingerprint`.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root this index belongs to; `new` debug-asserts it is absolute.
    project_root: PathBuf,
    /// NOTE(review): looks like files whose indexing was postponed — the
    /// exact semantics are not visible in this part of the file.
    deferred_files: HashSet<PathBuf>,
}
1237
/// Per-file metadata captured while collecting chunks; it is split into the
/// index's mtime, size and hash tables when an index is built.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time recorded for the file.
    mtime: SystemTime,
    /// File size recorded for the file.
    size: u64,
    /// blake3 hash recorded for the file's content.
    content_hash: blake3::Hash,
}
1244
/// Outcome of an incremental refresh of the semantic index.
///
/// All counts are numbers of files; `total_processed` is the number of
/// current/deleted files that were considered.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    pub changed: usize,
    pub added: usize,
    pub deleted: usize,
    pub total_processed: usize,
}

impl RefreshSummary {
    /// True when no files were touched, i.e. nothing was changed, added or
    /// deleted. `total_processed` does not influence the answer.
    pub fn is_noop(&self) -> bool {
        matches!((self.changed, self.added, self.deleted), (0, 0, 0))
    }
}
1261
/// Delta produced by refreshing a set of invalidated files.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Full replacement entries for `completed_paths`, not just newly embedded
    /// chunks. `apply_refresh_update` removes completed paths before extending
    /// this set, so reused chunks must travel in this delta too.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Freshness metadata paired with the path it belongs to.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Paths whose entries are replaced by `added_entries`.
    pub completed_paths: Vec<PathBuf>,
    /// File counts describing this refresh.
    pub summary: RefreshSummary,
}
1272
/// A previously computed embedding that can be reused when a chunk's
/// `embed_text` is unchanged.
#[derive(Debug, Clone)]
struct ReusableEmbedding {
    /// Exact text the vector was computed from; compared verbatim on lookup
    /// to confirm a hash match.
    embed_text: String,
    /// Vector stored for `embed_text`.
    vector: Vec<f32>,
}
1278
/// Reuse lookup: file path → blake3 hash of a chunk's `embed_text` → every
/// stored embedding in that file whose text has that hash.
type ChunkReuseMap = HashMap<PathBuf, HashMap<blake3::Hash, Vec<ReusableEmbedding>>>;
1280
/// Search result from a semantic query.
///
/// Most fields have the same names and types as the corresponding
/// `SemanticChunk` fields.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    pub file: PathBuf,
    pub name: String,
    pub kind: SymbolKind,
    pub start_line: u32,
    pub end_line: u32,
    pub exported: bool,
    pub snippet: String,
    /// Relevance score of this hit.
    /// NOTE(review): metric and range are not visible here — confirm at the
    /// search implementation.
    pub score: f32,
    /// Static label for where the result came from.
    /// NOTE(review): the set of possible values is not visible here.
    pub source: &'static str,
}
1294
1295impl SemanticIndex {
1296    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1297        debug_assert!(project_root.is_absolute());
1298        Self {
1299            entries: Vec::new(),
1300            file_mtimes: HashMap::new(),
1301            file_sizes: HashMap::new(),
1302            file_hashes: HashMap::new(),
1303            dimension,
1304            fingerprint: None,
1305            project_root,
1306            deferred_files: HashSet::new(),
1307        }
1308    }
1309
1310    /// Number of embedded symbol entries.
1311    pub fn entry_count(&self) -> usize {
1312        self.entries.len()
1313    }
1314
1315    /// Number of files currently tracked by the semantic index.
1316    pub fn indexed_file_count(&self) -> usize {
1317        self.file_mtimes.len()
1318    }
1319
1320    /// Human-readable status label for the index.
1321    pub fn status_label(&self) -> &'static str {
1322        if self.entries.is_empty() {
1323            "empty"
1324        } else {
1325            "ready"
1326        }
1327    }
1328
1329    fn collect_chunks(
1330        project_root: &Path,
1331        files: &[PathBuf],
1332    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1333        let collect_started = std::time::Instant::now();
1334        let per_file: Vec<(
1335            PathBuf,
1336            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1337        )> = files
1338            .par_iter()
1339            .map_init(HashMap::new, |parsers, file| {
1340                let result = collect_semantic_file(project_root, file, parsers);
1341                (file.clone(), result)
1342            })
1343            .collect();
1344
1345        let mut chunks: Vec<SemanticChunk> = Vec::new();
1346        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1347
1348        for (file, result) in per_file {
1349            match result {
1350                Ok((metadata, file_chunks)) => {
1351                    file_metadata.insert(file, metadata);
1352                    chunks.extend(file_chunks);
1353                }
1354                Err(error) => {
1355                    // "unsupported file extension" is expected for non-code files
1356                    // (json, xml, .gitignore, etc.) that get included in the
1357                    // project walk. Pre-fix this was swallowed by .unwrap_or_default();
1358                    // we now skip silently to keep the log clean. Only real read/parse
1359                    // errors are worth surfacing.
1360                    if error == "unsupported file extension" {
1361                        continue;
1362                    }
1363                    slog_warn!(
1364                        "failed to collect semantic chunks for {}: {}",
1365                        file.display(),
1366                        error
1367                    );
1368                }
1369            }
1370        }
1371
1372        slog_info!(
1373            "semantic collect: {} chunks from {} files in {} ms",
1374            chunks.len(),
1375            file_metadata.len(),
1376            collect_started.elapsed().as_millis()
1377        );
1378
1379        (chunks, file_metadata)
1380    }
1381
1382    fn build_chunk_reuse_map(&self, files: &[PathBuf]) -> ChunkReuseMap {
1383        let requested: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1384        let mut reuse_map: ChunkReuseMap = HashMap::new();
1385
1386        for entry in &self.entries {
1387            if !requested.contains(entry.chunk.file.as_path()) {
1388                continue;
1389            }
1390
1391            // `embed_text` is already persisted in the current on-disk format,
1392            // so refresh-time reuse can hash it in memory and confirm the exact
1393            // string without bumping `SEMANTIC_INDEX_VERSION` and forcing every
1394            // user through a full rebuild.
1395            let hash = blake3::hash(entry.chunk.embed_text.as_bytes());
1396            reuse_map
1397                .entry(entry.chunk.file.clone())
1398                .or_default()
1399                .entry(hash)
1400                .or_default()
1401                .push(ReusableEmbedding {
1402                    embed_text: entry.chunk.embed_text.clone(),
1403                    vector: entry.vector.clone(),
1404                });
1405        }
1406
1407        reuse_map
1408    }
1409
1410    fn reusable_vector_for_chunk(
1411        reuse_map: &ChunkReuseMap,
1412        chunk: &SemanticChunk,
1413    ) -> Option<Vec<f32>> {
1414        let hash = blake3::hash(chunk.embed_text.as_bytes());
1415        reuse_map
1416            .get(&chunk.file)?
1417            .get(&hash)?
1418            .iter()
1419            .find(|candidate| candidate.embed_text == chunk.embed_text)
1420            .map(|candidate| candidate.vector.clone())
1421    }
1422
1423    fn entries_for_chunks_with_reuse<F, P>(
1424        chunks: Vec<SemanticChunk>,
1425        reuse_map: &ChunkReuseMap,
1426        embed_fn: &mut F,
1427        max_batch_size: usize,
1428        initial_observed_dimension: Option<usize>,
1429        refresh_label: &str,
1430        progress: &mut P,
1431    ) -> Result<(Vec<EmbeddingEntry>, Option<usize>), String>
1432    where
1433        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1434        P: FnMut(usize, usize),
1435    {
1436        let total_chunks = chunks.len();
1437        progress(0, total_chunks);
1438
1439        let mut entries_by_chunk: Vec<Option<EmbeddingEntry>> = vec![None; total_chunks];
1440        let mut misses: Vec<(usize, SemanticChunk)> = Vec::new();
1441
1442        for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1443            if let Some(vector) = Self::reusable_vector_for_chunk(reuse_map, &chunk) {
1444                entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1445            } else {
1446                misses.push((chunk_index, chunk));
1447            }
1448        }
1449
1450        let mut completed = total_chunks.saturating_sub(misses.len());
1451        if completed > 0 {
1452            progress(completed, total_chunks);
1453        }
1454
1455        let batch_size = max_batch_size.max(1);
1456        let mut observed_dimension = initial_observed_dimension;
1457
1458        for batch_start in (0..misses.len()).step_by(batch_size) {
1459            let batch_end = (batch_start + batch_size).min(misses.len());
1460            let batch_texts: Vec<String> = misses[batch_start..batch_end]
1461                .iter()
1462                .map(|(_, chunk)| chunk.embed_text.clone())
1463                .collect();
1464
1465            let vectors = embed_fn(batch_texts)?;
1466            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1467
1468            if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1469                match observed_dimension {
1470                    None => observed_dimension = Some(dim),
1471                    Some(expected) if dim != expected => {
1472                        return Err(format!(
1473                            "embedding dimension changed during {refresh_label}: \
1474                             cached index uses {expected}, new vectors use {dim}"
1475                        ));
1476                    }
1477                    _ => {}
1478                }
1479            }
1480
1481            for (i, vector) in vectors.into_iter().enumerate() {
1482                let (chunk_index, chunk) = misses[batch_start + i].clone();
1483                entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1484            }
1485
1486            completed += batch_end - batch_start;
1487            progress(completed, total_chunks);
1488        }
1489
1490        let entries = entries_by_chunk
1491            .into_iter()
1492            .map(|entry| entry.expect("semantic refresh accounted for every chunk"))
1493            .collect();
1494
1495        Ok((entries, observed_dimension))
1496    }
1497
1498    fn build_from_chunks<F, P>(
1499        project_root: &Path,
1500        chunks: Vec<SemanticChunk>,
1501        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1502        embed_fn: &mut F,
1503        max_batch_size: usize,
1504        mut progress: Option<&mut P>,
1505    ) -> Result<Self, String>
1506    where
1507        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1508        P: FnMut(usize, usize),
1509    {
1510        debug_assert!(project_root.is_absolute());
1511        let total_chunks = chunks.len();
1512
1513        if chunks.is_empty() {
1514            return Ok(Self {
1515                entries: Vec::new(),
1516                file_mtimes: file_metadata
1517                    .iter()
1518                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
1519                    .collect(),
1520                file_sizes: file_metadata
1521                    .iter()
1522                    .map(|(path, metadata)| (path.clone(), metadata.size))
1523                    .collect(),
1524                file_hashes: file_metadata
1525                    .into_iter()
1526                    .map(|(path, metadata)| (path, metadata.content_hash))
1527                    .collect(),
1528                dimension: DEFAULT_DIMENSION,
1529                fingerprint: None,
1530                project_root: project_root.to_path_buf(),
1531                deferred_files: HashSet::new(),
1532            });
1533        }
1534
1535        // Embed in batches
1536        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1537        let mut expected_dimension: Option<usize> = None;
1538        let batch_size = max_batch_size.max(1);
1539        let embed_started = std::time::Instant::now();
1540        let batch_count = total_chunks.div_ceil(batch_size);
1541        for batch_start in (0..chunks.len()).step_by(batch_size) {
1542            let batch_end = (batch_start + batch_size).min(chunks.len());
1543            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1544                .iter()
1545                .map(|c| c.embed_text.clone())
1546                .collect();
1547
1548            let vectors = embed_fn(batch_texts)?;
1549            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1550
1551            // Track consistent dimension across all batches
1552            if let Some(dim) = vectors.first().map(|v| v.len()) {
1553                match expected_dimension {
1554                    None => expected_dimension = Some(dim),
1555                    Some(expected) if dim != expected => {
1556                        return Err(format!(
1557                            "embedding dimension changed across batches: expected {expected}, got {dim}"
1558                        ));
1559                    }
1560                    _ => {}
1561                }
1562            }
1563
1564            for (i, vector) in vectors.into_iter().enumerate() {
1565                let chunk_idx = batch_start + i;
1566                entries.push(EmbeddingEntry {
1567                    chunk: chunks[chunk_idx].clone(),
1568                    vector,
1569                });
1570            }
1571
1572            if let Some(callback) = progress.as_mut() {
1573                callback(entries.len(), total_chunks);
1574            }
1575        }
1576
1577        let embed_ms = embed_started.elapsed().as_millis();
1578        let rate = (total_chunks as u128 * 1000)
1579            .checked_div(embed_ms)
1580            .unwrap_or(0) as u64;
1581        slog_info!(
1582            "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1583            total_chunks,
1584            batch_count,
1585            embed_ms,
1586            rate
1587        );
1588
1589        let dimension = entries
1590            .first()
1591            .map(|e| e.vector.len())
1592            .unwrap_or(DEFAULT_DIMENSION);
1593
1594        Ok(Self {
1595            entries,
1596            file_mtimes: file_metadata
1597                .iter()
1598                .map(|(path, metadata)| (path.clone(), metadata.mtime))
1599                .collect(),
1600            file_sizes: file_metadata
1601                .iter()
1602                .map(|(path, metadata)| (path.clone(), metadata.size))
1603                .collect(),
1604            file_hashes: file_metadata
1605                .into_iter()
1606                .map(|(path, metadata)| (path, metadata.content_hash))
1607                .collect(),
1608            dimension,
1609            fingerprint: None,
1610            project_root: project_root.to_path_buf(),
1611            deferred_files: HashSet::new(),
1612        })
1613    }
1614
1615    /// Build the semantic index from a set of files using the provided embedding function.
1616    /// `embed_fn` takes a batch of texts and returns a batch of embedding vectors.
1617    pub fn build<F>(
1618        project_root: &Path,
1619        files: &[PathBuf],
1620        embed_fn: &mut F,
1621        max_batch_size: usize,
1622    ) -> Result<Self, String>
1623    where
1624        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1625    {
1626        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1627        Self::build_from_chunks(
1628            project_root,
1629            chunks,
1630            file_mtimes,
1631            embed_fn,
1632            max_batch_size,
1633            Option::<&mut fn(usize, usize)>::None,
1634        )
1635    }
1636
1637    /// Build the semantic index and report embedding progress using entry counts.
1638    pub fn build_with_progress<F, P>(
1639        project_root: &Path,
1640        files: &[PathBuf],
1641        embed_fn: &mut F,
1642        max_batch_size: usize,
1643        progress: &mut P,
1644    ) -> Result<Self, String>
1645    where
1646        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1647        P: FnMut(usize, usize),
1648    {
1649        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1650        let total_chunks = chunks.len();
1651        progress(0, total_chunks);
1652        Self::build_from_chunks(
1653            project_root,
1654            chunks,
1655            file_mtimes,
1656            embed_fn,
1657            max_batch_size,
1658            Some(progress),
1659        )
1660    }
1661
/// Incrementally refresh entries for changed/new files only, preserving cached
/// embeddings for unchanged files. Used when loading the index from disk and
/// finding that a small fraction of files have been modified, deleted, or added.
///
/// Returns a `RefreshSummary` describing what changed. On success, `self` is
/// mutated in place and remains a valid index.
///
/// `current_files` is the full set of files the project considers indexable
/// (typically `walk_project_files(...)`). Files in the cache that are no
/// longer in this set are treated as deleted.
///
/// `progress` is called with `(0, 0)` on every path that embeds nothing;
/// otherwise it is handed to `entries_for_chunks_with_reuse`.
///
/// # Errors
///
/// Propagates the error from `entries_for_chunks_with_reuse`. At that point
/// entries of deleted and vanished files have already been dropped, while
/// entries of changed files are still the old ones.
pub fn refresh_stale_files<F, P>(
    &mut self,
    project_root: &Path,
    current_files: &[PathBuf],
    embed_fn: &mut F,
    max_batch_size: usize,
    progress: &mut P,
) -> Result<RefreshSummary, String>
where
    F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
    P: FnMut(usize, usize),
{
    self.backfill_missing_file_sizes();

    // 1. Bucket files into deleted / changed / added.
    let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
    // Deferred markers only make sense for files that are still in the walk.
    self.deferred_files
        .retain(|path| current_set.contains(path.as_path()));
    // Size of the union of walked files and cached files:
    // |current| + |cached| - |current ∩ cached|.
    let total_processed = current_set.len() + self.file_mtimes.len()
        - self
            .file_mtimes
            .keys()
            .filter(|path| current_set.contains(path.as_path()))
            .count();

    // Per-cached-file outcome. `Deleted` covers files that are no longer in
    // the walked set; files that disappeared from disk surface later as
    // `Verified(_, FreshnessVerdict::Deleted)`.
    enum IndexedFileCheck {
        Deleted(PathBuf),
        MissingMetadata(PathBuf),
        Verified(PathBuf, FreshnessVerdict),
    }

    let mut deleted: Vec<PathBuf> = Vec::new();
    let mut changed: Vec<PathBuf> = Vec::new();
    let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
    // One slot per cached file; `None` marks a slot that the strict
    // verification pass below must fill in.
    let mut checks: Vec<Option<IndexedFileCheck>> = Vec::with_capacity(indexed_paths.len());
    let mut strict_verify_inputs: Vec<(usize, PathBuf, FileFreshness)> = Vec::new();

    for indexed_path in indexed_paths {
        let check_index = checks.len();
        if !current_set.contains(indexed_path.as_path()) {
            checks.push(Some(IndexedFileCheck::Deleted(indexed_path)));
            continue;
        }
        // Strict verification needs all three cached facts; a file lacking
        // any of them is treated as changed.
        let cached = match (
            self.file_mtimes.get(&indexed_path),
            self.file_sizes.get(&indexed_path),
            self.file_hashes.get(&indexed_path),
        ) {
            (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                mtime: *mtime,
                size: *size,
                content_hash: *hash,
            }),
            _ => None,
        };
        if let Some(freshness) = cached {
            strict_verify_inputs.push((check_index, indexed_path, freshness));
            checks.push(None);
        } else {
            checks.push(Some(IndexedFileCheck::MissingMetadata(indexed_path)));
        }
    }

    // Verdicts come back tagged with the slot index recorded above.
    for (check_index, path, verdict) in
        cache_freshness::verify_files_strict_bounded(strict_verify_inputs)
    {
        checks[check_index] = Some(IndexedFileCheck::Verified(path, verdict));
    }

    for check in checks {
        // Panics if the verifier did not return a verdict for some input.
        match check.expect("strict freshness check should be populated") {
            IndexedFileCheck::Deleted(path) => deleted.push(path),
            IndexedFileCheck::MissingMetadata(path) => changed.push(path),
            IndexedFileCheck::Verified(_path, FreshnessVerdict::HotFresh) => {}
            // Content is unchanged: keep the entries and only record the new
            // mtime/size reported by the verifier.
            IndexedFileCheck::Verified(
                path,
                FreshnessVerdict::ContentFresh {
                    new_mtime,
                    new_size,
                },
            ) => {
                self.file_mtimes.insert(path.clone(), new_mtime);
                self.file_sizes.insert(path, new_size);
            }
            // A file reported as deleted goes through the `changed` bucket;
            // the `vanished` check further down removes it if it is truly gone.
            IndexedFileCheck::Verified(
                path,
                FreshnessVerdict::Stale | FreshnessVerdict::Deleted,
            ) => {
                changed.push(path);
            }
        }
    }

    // Files in walk that were never indexed.
    let mut added: Vec<PathBuf> = Vec::new();
    for path in current_files {
        if !self.file_mtimes.contains_key(path) {
            added.push(path.clone());
        }
    }

    // Fast path: nothing to do.
    if deleted.is_empty() && changed.is_empty() && added.is_empty() {
        progress(0, 0);
        return Ok(RefreshSummary {
            total_processed,
            ..RefreshSummary::default()
        });
    }

    // 2. Drop entries for deleted files immediately. Changed files are only
    //    replaced after successful re-extraction + embedding so transient
    //    read/parse errors keep the stale-but-valid cache entry.
    if !deleted.is_empty() {
        self.remove_indexed_files(&deleted);
    }

    // 3. Embed the changed + added set, if any.
    let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
    to_embed.extend(changed.iter().cloned());
    to_embed.extend(added.iter().cloned());

    if to_embed.is_empty() {
        // Only deletions happened.
        progress(0, 0);
        return Ok(RefreshSummary {
            changed: 0,
            added: 0,
            deleted: deleted.len(),
            total_processed,
        });
    }

    // Built before any entry of a changed file is dropped.
    let reuse_map = self.build_chunk_reuse_map(&changed);
    let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
    let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
    // A previously indexed file that yielded no fresh metadata and no longer
    // exists on disk is treated as deleted rather than kept as stale.
    let vanished = to_embed
        .iter()
        .filter(|path| {
            changed_set.contains(path.as_path())
                && !fresh_metadata.contains_key(*path)
                && !path.exists()
        })
        .cloned()
        .collect::<Vec<_>>();
    if !vanished.is_empty() {
        self.remove_indexed_files(&vanished);
        deleted.extend(vanished);
    }

    // Files were collected but produced no chunks: there is nothing to embed,
    // but their old entries are dropped and their metadata is recorded.
    if chunks.is_empty() {
        progress(0, 0);
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }
        let changed_count = changed
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count();
        let added_count = added
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count();
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file.clone(), metadata.content_hash);
        }
        return Ok(RefreshSummary {
            changed: changed_count,
            added: added_count,
            deleted: deleted.len(),
            total_processed,
        });
    }

    // 4. Build the full replacement set, reusing cached vectors for chunks
    //    whose embed_text is unchanged and embedding only cache misses.
    // With no entries left there is no dimension to enforce yet.
    let existing_dimension = if self.entries.is_empty() {
        None
    } else {
        Some(self.dimension)
    };
    let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
        chunks,
        &reuse_map,
        embed_fn,
        max_batch_size,
        existing_dimension,
        "incremental refresh",
        progress,
    )?;

    // Embedding succeeded: only now are the old entries of the re-extracted
    // files swapped out and their deferred markers cleared.
    let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
    for file in &successful_files {
        self.deferred_files.remove(file);
    }
    if !successful_files.is_empty() {
        self.entries
            .retain(|entry| !successful_files.contains(&entry.chunk.file));
    }

    self.entries.extend(new_entries);
    for (file, metadata) in fresh_metadata {
        self.file_mtimes.insert(file.clone(), metadata.mtime);
        self.file_sizes.insert(file.clone(), metadata.size);
        self.file_hashes.insert(file, metadata.content_hash);
    }
    if let Some(dim) = observed_dimension {
        self.dimension = dim;
    }

    // Only files that produced fresh metadata count as changed/added.
    Ok(RefreshSummary {
        changed: changed
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count(),
        added: added
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count(),
        deleted: deleted.len(),
        total_processed,
    })
}
1904
1905    /// Refresh exactly the files invalidated by the live watcher, without
1906    /// treating the provided path list as the whole project. This is the
1907    /// watcher-side counterpart to `refresh_stale_files`: it drops any stale
1908    /// entries for the requested paths from this in-memory index, re-extracts
1909    /// whatever still exists on disk, embeds those chunks, and returns the
1910    /// delta needed for another in-memory index to apply the same update.
1911    pub fn refresh_invalidated_files<F, P>(
1912        &mut self,
1913        project_root: &Path,
1914        paths: &[PathBuf],
1915        embed_fn: &mut F,
1916        max_batch_size: usize,
1917        max_files: usize,
1918        progress: &mut P,
1919    ) -> Result<InvalidatedFilesRefresh, String>
1920    where
1921        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1922        P: FnMut(usize, usize),
1923    {
1924        self.backfill_missing_file_sizes();
1925
1926        self.deferred_files.retain(|path| path.exists());
1927        let mut requested_paths = paths.to_vec();
1928        requested_paths.extend(self.deferred_files.iter().cloned());
1929        requested_paths.sort();
1930        requested_paths.dedup();
1931        let total_processed = requested_paths.len();
1932
1933        if requested_paths.is_empty() {
1934            progress(0, 0);
1935            return Ok(InvalidatedFilesRefresh {
1936                summary: RefreshSummary {
1937                    total_processed,
1938                    ..RefreshSummary::default()
1939                },
1940                ..InvalidatedFilesRefresh::default()
1941            });
1942        }
1943
1944        let previously_indexed: HashSet<PathBuf> = requested_paths
1945            .iter()
1946            .filter(|path| self.file_mtimes.contains_key(*path))
1947            .cloned()
1948            .collect();
1949        let reuse_map = self.build_chunk_reuse_map(&requested_paths);
1950
1951        // The watcher path has already invalidated these files in the request
1952        // thread's live index. Mirror that behavior here before inserting any
1953        // fresh chunks so parse/read failures do not resurrect stale entries.
1954        self.remove_indexed_files(&requested_paths);
1955
1956        let existing_paths = requested_paths
1957            .iter()
1958            .filter(|path| path.exists())
1959            .cloned()
1960            .collect::<Vec<_>>();
1961        let deleted = requested_paths
1962            .iter()
1963            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
1964            .count();
1965
1966        if existing_paths.is_empty() {
1967            for path in &requested_paths {
1968                if !path.exists() {
1969                    self.deferred_files.remove(path);
1970                }
1971            }
1972            progress(0, 0);
1973            return Ok(InvalidatedFilesRefresh {
1974                completed_paths: requested_paths,
1975                summary: RefreshSummary {
1976                    deleted,
1977                    total_processed,
1978                    ..RefreshSummary::default()
1979                },
1980                ..InvalidatedFilesRefresh::default()
1981            });
1982        }
1983
1984        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);
1985
1986        let retained_file_count = self.file_mtimes.len();
1987        let changed_successful_count = existing_paths
1988            .iter()
1989            .filter(|path| {
1990                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1991            })
1992            .count();
1993        let available_new_files =
1994            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
1995        let new_successful_files = existing_paths
1996            .iter()
1997            .filter(|path| {
1998                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1999            })
2000            .cloned()
2001            .collect::<Vec<_>>();
2002        if new_successful_files.len() > available_new_files {
2003            let allowed_new_files = new_successful_files
2004                .iter()
2005                .take(available_new_files)
2006                .cloned()
2007                .collect::<HashSet<_>>();
2008            let deferred_new_files = new_successful_files
2009                .into_iter()
2010                .filter(|path| !allowed_new_files.contains(path))
2011                .collect::<HashSet<_>>();
2012
2013            fresh_metadata.retain(|file, _| {
2014                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
2015            });
2016            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));
2017
2018            if !deferred_new_files.is_empty() {
2019                for path in &deferred_new_files {
2020                    self.deferred_files.insert(path.clone());
2021                }
2022                slog_warn!(
2023                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
2024                    deferred_new_files.len(),
2025                    max_files
2026                );
2027            }
2028        }
2029
2030        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
2031        for file in &successful_files {
2032            self.deferred_files.remove(file);
2033        }
2034        let changed = successful_files
2035            .iter()
2036            .filter(|path| previously_indexed.contains(path.as_path()))
2037            .count();
2038        let added = successful_files.len().saturating_sub(changed);
2039        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());
2040
2041        if chunks.is_empty() {
2042            progress(0, 0);
2043            for (file, metadata) in fresh_metadata {
2044                let freshness = FileFreshness {
2045                    mtime: metadata.mtime,
2046                    size: metadata.size,
2047                    content_hash: metadata.content_hash,
2048                };
2049                self.file_mtimes.insert(file.clone(), freshness.mtime);
2050                self.file_sizes.insert(file.clone(), freshness.size);
2051                self.file_hashes
2052                    .insert(file.clone(), freshness.content_hash);
2053                updated_metadata.push((file, freshness));
2054            }
2055
2056            return Ok(InvalidatedFilesRefresh {
2057                updated_metadata,
2058                completed_paths: requested_paths,
2059                summary: RefreshSummary {
2060                    changed,
2061                    added,
2062                    deleted,
2063                    total_processed,
2064                },
2065                ..InvalidatedFilesRefresh::default()
2066            });
2067        }
2068
2069        let initial_observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty()
2070        {
2071            None
2072        } else {
2073            Some(self.dimension)
2074        };
2075        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
2076            chunks,
2077            &reuse_map,
2078            embed_fn,
2079            max_batch_size,
2080            initial_observed_dimension,
2081            "invalidated-file refresh",
2082            progress,
2083        )?;
2084
2085        let added_entries = new_entries.clone();
2086        self.entries.extend(new_entries);
2087        for (file, metadata) in fresh_metadata {
2088            let freshness = FileFreshness {
2089                mtime: metadata.mtime,
2090                size: metadata.size,
2091                content_hash: metadata.content_hash,
2092            };
2093            self.file_mtimes.insert(file.clone(), freshness.mtime);
2094            self.file_sizes.insert(file.clone(), freshness.size);
2095            self.file_hashes
2096                .insert(file.clone(), freshness.content_hash);
2097            updated_metadata.push((file, freshness));
2098        }
2099        if let Some(dim) = observed_dimension {
2100            self.dimension = dim;
2101        }
2102
2103        Ok(InvalidatedFilesRefresh {
2104            added_entries,
2105            updated_metadata,
2106            completed_paths: requested_paths,
2107            summary: RefreshSummary {
2108                changed,
2109                added,
2110                deleted,
2111                total_processed,
2112            },
2113        })
2114    }
2115
2116    pub fn apply_refresh_update(
2117        &mut self,
2118        added_entries: Vec<EmbeddingEntry>,
2119        updated_metadata: Vec<(PathBuf, FileFreshness)>,
2120        completed_paths: &[PathBuf],
2121    ) {
2122        // `added_entries` is the complete replacement set for completed paths:
2123        // freshly embedded misses plus reused chunks carrying refreshed metadata.
2124        // Removing first is safe only because producers include both kinds.
2125        self.remove_indexed_files(completed_paths);
2126
2127        let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
2128        self.entries.extend(added_entries);
2129        for (file, freshness) in updated_metadata {
2130            self.file_mtimes.insert(file.clone(), freshness.mtime);
2131            self.file_sizes.insert(file.clone(), freshness.size);
2132            self.file_hashes.insert(file, freshness.content_hash);
2133        }
2134        if let Some(dim) = observed_dimension {
2135            self.dimension = dim;
2136        }
2137    }
2138
2139    fn remove_indexed_files(&mut self, files: &[PathBuf]) {
2140        let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
2141        self.entries
2142            .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
2143        for path in files {
2144            self.file_mtimes.remove(path);
2145            self.file_sizes.remove(path);
2146            self.file_hashes.remove(path);
2147        }
2148    }
2149
2150    /// Search the index with a query embedding, returning top-K results sorted by relevance
2151    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
2152        if self.entries.is_empty() || query_vector.len() != self.dimension {
2153            return Vec::new();
2154        }
2155
2156        let mut scored: Vec<(f32, usize)> = self
2157            .entries
2158            .iter()
2159            .enumerate()
2160            .map(|(i, entry)| {
2161                let mut score = cosine_similarity(query_vector, &entry.vector);
2162                if entry.chunk.exported {
2163                    score *= 1.1;
2164                }
2165                (score, i)
2166            })
2167            .collect();
2168
2169        let keep = top_k.min(scored.len());
2170        if keep == 0 {
2171            return Vec::new();
2172        }
2173
2174        if keep < scored.len() {
2175            scored.select_nth_unstable_by(keep, semantic_score_order);
2176            scored.truncate(keep);
2177        }
2178        scored.sort_by(semantic_score_order);
2179
2180        scored
2181            .into_iter()
2182            // Keep the selected best-first slice mapped without reintroducing the
2183            // old `> 0.0` floor: top_k has already been selected, and zero-score
2184            // tail entries remain observable when requested.
2185            .map(|(score, idx)| {
2186                let entry = &self.entries[idx];
2187                SemanticResult {
2188                    file: entry.chunk.file.clone(),
2189                    name: entry.chunk.name.clone(),
2190                    kind: entry.chunk.kind.clone(),
2191                    start_line: entry.chunk.start_line,
2192                    end_line: entry.chunk.end_line,
2193                    exported: entry.chunk.exported,
2194                    snippet: entry.chunk.snippet.clone(),
2195                    score,
2196                    source: "semantic",
2197                }
2198            })
2199            .collect()
2200    }
2201
/// Number of indexed entries (one per embedded chunk) currently in the index.
pub fn len(&self) -> usize {
    self.entries.len()
}
2206
2207    /// Check if a file needs re-indexing based on mtime/size
2208    pub fn is_file_stale(&self, file: &Path) -> bool {
2209        let Some(stored_mtime) = self.file_mtimes.get(file) else {
2210            return true;
2211        };
2212        let Some(stored_size) = self.file_sizes.get(file) else {
2213            return true;
2214        };
2215        let Some(stored_hash) = self.file_hashes.get(file) else {
2216            return true;
2217        };
2218        let cached = FileFreshness {
2219            mtime: *stored_mtime,
2220            size: *stored_size,
2221            content_hash: *stored_hash,
2222        };
2223        match cache_freshness::verify_file_strict(file, &cached) {
2224            FreshnessVerdict::HotFresh => false,
2225            FreshnessVerdict::ContentFresh { .. } => false,
2226            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2227        }
2228    }
2229
2230    fn backfill_missing_file_sizes(&mut self) {
2231        for path in self.file_mtimes.keys() {
2232            if self.file_sizes.contains_key(path) {
2233                continue;
2234            }
2235            if let Ok(metadata) = fs::metadata(path) {
2236                self.file_sizes.insert(path.clone(), metadata.len());
2237                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2238                    self.file_hashes.insert(path.clone(), hash);
2239                }
2240            }
2241        }
2242    }
2243
/// Remove entries for a specific file.
///
/// Delegates to `invalidate_file`; the two methods behave identically.
pub fn remove_file(&mut self, file: &Path) {
    self.invalidate_file(file);
}
2248
2249    pub fn invalidate_file(&mut self, file: &Path) {
2250        let canonical_file = canonicalize_existing_or_deleted_path(file);
2251        self.entries
2252            .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2253        self.file_mtimes.remove(file);
2254        self.file_sizes.remove(file);
2255        self.file_hashes.remove(file);
2256        if canonical_file.as_path() != file {
2257            self.file_mtimes.remove(&canonical_file);
2258            self.file_sizes.remove(&canonical_file);
2259            self.file_hashes.remove(&canonical_file);
2260        }
2261    }
2262
/// Get the embedding dimension, i.e. the vector length that `search` requires
/// of a query vector.
pub fn dimension(&self) -> usize {
    self.dimension
}
2267
/// Borrow the fingerprint attached to this index, or `None` if none has been
/// set.
pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
    self.fingerprint.as_ref()
}
2271
2272    pub fn backend_label(&self) -> Option<&str> {
2273        self.fingerprint.as_ref().map(|f| f.backend.as_str())
2274    }
2275
2276    pub fn model_label(&self) -> Option<&str> {
2277        self.fingerprint.as_ref().map(|f| f.model.as_str())
2278    }
2279
/// Attach `fingerprint` to this index, replacing any previous one.
pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
    self.fingerprint = Some(fingerprint);
}
2283
2284    /// Write the semantic index to disk using atomic temp+rename pattern
2285    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2286        // Don't persist empty indexes — they would be loaded on next startup
2287        // and prevent a fresh build that might find files.
2288        if self.entries.is_empty() {
2289            slog_info!("skipping semantic index persistence (0 entries)");
2290            return;
2291        }
2292        let dir = storage_dir.join("semantic").join(project_key);
2293        if let Err(e) = fs::create_dir_all(&dir) {
2294            slog_warn!("failed to create semantic cache dir: {}", e);
2295            return;
2296        }
2297        let data_path = dir.join("semantic.bin");
2298        let tmp_path = dir.join(format!(
2299            "semantic.bin.tmp.{}.{}",
2300            std::process::id(),
2301            SystemTime::now()
2302                .duration_since(SystemTime::UNIX_EPOCH)
2303                .unwrap_or(Duration::ZERO)
2304                .as_nanos()
2305        ));
2306        let write_result = (|| -> io::Result<usize> {
2307            let file = fs::File::create(&tmp_path)?;
2308            let mut writer = BufWriter::new(file);
2309            let bytes_written = self.write_to_writer(&mut writer)?;
2310            writer.flush()?;
2311            writer.get_ref().sync_all()?;
2312            Ok(bytes_written)
2313        })();
2314        let bytes_written = match write_result {
2315            Ok(bytes_written) => bytes_written,
2316            Err(e) => {
2317                slog_warn!("failed to write semantic index: {}", e);
2318                let _ = fs::remove_file(&tmp_path);
2319                return;
2320            }
2321        };
2322        if let Err(e) = fs::rename(&tmp_path, &data_path) {
2323            slog_warn!("failed to rename semantic index: {}", e);
2324            let _ = fs::remove_file(&tmp_path);
2325            return;
2326        }
2327        slog_info!(
2328            "semantic index persisted: {} entries, {:.1} KB",
2329            self.entries.len(),
2330            bytes_written as f64 / 1024.0
2331        );
2332    }
2333
2334    /// Read the semantic index from disk
2335    pub fn read_from_disk(
2336        storage_dir: &Path,
2337        project_key: &str,
2338        current_canonical_root: &Path,
2339        is_worktree_bridge: bool,
2340        expected_fingerprint: Option<&str>,
2341    ) -> Option<Self> {
2342        debug_assert!(current_canonical_root.is_absolute());
2343        let data_path = storage_dir
2344            .join("semantic")
2345            .join(project_key)
2346            .join("semantic.bin");
2347        let file = fs::File::open(&data_path).ok()?;
2348        let file_len = usize::try_from(file.metadata().ok()?.len()).ok()?;
2349        if file_len < HEADER_BYTES_V1 {
2350            slog_warn!(
2351                "corrupt semantic index (too small: {} bytes), removing",
2352                file_len
2353            );
2354            if !is_worktree_bridge {
2355                let _ = fs::remove_file(&data_path);
2356            }
2357            return None;
2358        }
2359
2360        let mut reader = BufReader::new(file);
2361        let mut version_buf = [0u8; 1];
2362        reader.read_exact(&mut version_buf).ok()?;
2363        let version = version_buf[0];
2364        if version != SEMANTIC_INDEX_VERSION_V6 {
2365            slog_info!(
2366                "cached semantic index version {} is older than {}, rebuilding",
2367                version,
2368                SEMANTIC_INDEX_VERSION_V6
2369            );
2370            if !is_worktree_bridge {
2371                let _ = fs::remove_file(&data_path);
2372            }
2373            return None;
2374        }
2375        match Self::from_reader_after_version(
2376            reader,
2377            version,
2378            current_canonical_root,
2379            Some(file_len),
2380            1,
2381        ) {
2382            Ok(index) => {
2383                if index.entries.is_empty() {
2384                    slog_info!("cached semantic index is empty, will rebuild");
2385                    if !is_worktree_bridge {
2386                        let _ = fs::remove_file(&data_path);
2387                    }
2388                    return None;
2389                }
2390                if let Some(expected) = expected_fingerprint {
2391                    let matches = index
2392                        .fingerprint()
2393                        .map(|fingerprint| fingerprint.matches_expected(expected))
2394                        .unwrap_or(false);
2395                    if !matches {
2396                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2397                        if !is_worktree_bridge {
2398                            let _ = fs::remove_file(&data_path);
2399                        }
2400                        return None;
2401                    }
2402                }
2403                slog_info!(
2404                    "loaded semantic index from disk: {} entries",
2405                    index.entries.len()
2406                );
2407                Some(index)
2408            }
2409            Err(e) => {
2410                slog_warn!("corrupt semantic index, rebuilding: {}", e);
2411                if !is_worktree_bridge {
2412                    let _ = fs::remove_file(&data_path);
2413                }
2414                None
2415            }
2416        }
2417    }
2418
2419    /// Serialize the index to bytes for disk persistence
2420    pub fn to_bytes(&self) -> Vec<u8> {
2421        let mut buf = Vec::new();
2422        self.write_to_writer(&mut buf)
2423            .expect("writing semantic index to Vec cannot fail");
2424        buf
2425    }
2426
2427    fn write_to_writer<W: Write>(&self, writer: &mut W) -> io::Result<usize> {
2428        let mut bytes_written = 0usize;
2429        let fingerprint = self.fingerprint.as_ref().and_then(|fingerprint| {
2430            let encoded = fingerprint.as_string();
2431            if encoded.is_empty() {
2432                None
2433            } else {
2434                Some(encoded)
2435            }
2436        });
2437        let fp_bytes_ref = fingerprint.as_deref().map(str::as_bytes).unwrap_or(&[]);
2438        let file_mtime_count = self
2439            .file_mtimes
2440            .iter()
2441            .filter(|(path, _)| cache_relative_path(&self.project_root, path).is_some())
2442            .count();
2443        let entry_count = self
2444            .entries
2445            .iter()
2446            .filter(|entry| cache_relative_path(&self.project_root, &entry.chunk.file).is_some())
2447            .count();
2448
2449        // Header: version(1) + dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint
2450        //
2451        // V6 is the single write format. Layout extends V5:
2452        //   - fingerprint is always represented (absent ⇒ fingerprint_len=0,
2453        //     no bytes follow). Uniform format simplifies the reader.
2454        //   - paths are relative to project_root.
2455        //   - file metadata stored as secs(u64) + subsec_nanos(u32) + size(u64) + blake3(32).
2456        //     Preserves full APFS/ext4/NTFS precision and catches mtime ties.
2457        //
2458        // V1/V2 remain readable for backward compatibility (see from_bytes).
2459        // V3/V4 load as compatible formats but are rejected on disk so snippets
2460        // and file sizes are rebuilt once.
2461        let version = SEMANTIC_INDEX_VERSION_V6;
2462        write_counted(writer, &[version], &mut bytes_written)?;
2463        write_counted(
2464            writer,
2465            &(self.dimension as u32).to_le_bytes(),
2466            &mut bytes_written,
2467        )?;
2468        write_counted(
2469            writer,
2470            &(entry_count as u32).to_le_bytes(),
2471            &mut bytes_written,
2472        )?;
2473        write_counted(
2474            writer,
2475            &(fp_bytes_ref.len() as u32).to_le_bytes(),
2476            &mut bytes_written,
2477        )?;
2478        write_counted(writer, fp_bytes_ref, &mut bytes_written)?;
2479
2480        // File mtime table: count(4) + entries
2481        // V3 layout per entry: path_len(4) + path + secs(8) + subsec_nanos(4)
2482        write_counted(
2483            writer,
2484            &(file_mtime_count as u32).to_le_bytes(),
2485            &mut bytes_written,
2486        )?;
2487        for (path, mtime) in &self.file_mtimes {
2488            let Some(relative) = cache_relative_path(&self.project_root, path) else {
2489                continue;
2490            };
2491            let relative = relative.to_string_lossy();
2492            let path_bytes = relative.as_bytes();
2493            write_counted(
2494                writer,
2495                &(path_bytes.len() as u32).to_le_bytes(),
2496                &mut bytes_written,
2497            )?;
2498            write_counted(writer, path_bytes, &mut bytes_written)?;
2499            let duration = mtime
2500                .duration_since(SystemTime::UNIX_EPOCH)
2501                .unwrap_or_default();
2502            write_counted(
2503                writer,
2504                &duration.as_secs().to_le_bytes(),
2505                &mut bytes_written,
2506            )?;
2507            write_counted(
2508                writer,
2509                &duration.subsec_nanos().to_le_bytes(),
2510                &mut bytes_written,
2511            )?;
2512            let size = self.file_sizes.get(path).copied().unwrap_or_default();
2513            write_counted(writer, &size.to_le_bytes(), &mut bytes_written)?;
2514            let hash = self
2515                .file_hashes
2516                .get(path)
2517                .copied()
2518                .unwrap_or_else(cache_freshness::zero_hash);
2519            write_counted(writer, hash.as_bytes(), &mut bytes_written)?;
2520        }
2521
2522        // Entries: each is metadata + vector
2523        for entry in &self.entries {
2524            let Some(relative) = cache_relative_path(&self.project_root, &entry.chunk.file) else {
2525                continue;
2526            };
2527            let c = &entry.chunk;
2528
2529            // File path
2530            let relative = relative.to_string_lossy();
2531            let file_bytes = relative.as_bytes();
2532            write_counted(
2533                writer,
2534                &(file_bytes.len() as u32).to_le_bytes(),
2535                &mut bytes_written,
2536            )?;
2537            write_counted(writer, file_bytes, &mut bytes_written)?;
2538
2539            // Name
2540            let name_bytes = c.name.as_bytes();
2541            write_counted(
2542                writer,
2543                &(name_bytes.len() as u32).to_le_bytes(),
2544                &mut bytes_written,
2545            )?;
2546            write_counted(writer, name_bytes, &mut bytes_written)?;
2547
2548            // Kind (1 byte)
2549            write_counted(writer, &[symbol_kind_to_u8(&c.kind)], &mut bytes_written)?;
2550
2551            // Lines + exported
2552            write_counted(
2553                writer,
2554                &(c.start_line as u32).to_le_bytes(),
2555                &mut bytes_written,
2556            )?;
2557            write_counted(
2558                writer,
2559                &(c.end_line as u32).to_le_bytes(),
2560                &mut bytes_written,
2561            )?;
2562            write_counted(writer, &[c.exported as u8], &mut bytes_written)?;
2563
2564            // Snippet
2565            let snippet_bytes = c.snippet.as_bytes();
2566            write_counted(
2567                writer,
2568                &(snippet_bytes.len() as u32).to_le_bytes(),
2569                &mut bytes_written,
2570            )?;
2571            write_counted(writer, snippet_bytes, &mut bytes_written)?;
2572
2573            // Embed text
2574            let embed_bytes = c.embed_text.as_bytes();
2575            write_counted(
2576                writer,
2577                &(embed_bytes.len() as u32).to_le_bytes(),
2578                &mut bytes_written,
2579            )?;
2580            write_counted(writer, embed_bytes, &mut bytes_written)?;
2581
2582            // Vector (f32 array)
2583            for &val in &entry.vector {
2584                write_counted(writer, &val.to_le_bytes(), &mut bytes_written)?;
2585            }
2586        }
2587
2588        Ok(bytes_written)
2589    }
2590
2591    /// Deserialize the index from bytes
2592    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2593        debug_assert!(current_canonical_root.is_absolute());
2594        if data.len() < HEADER_BYTES_V1 {
2595            return Err("data too short".to_string());
2596        }
2597
2598        Self::from_reader_after_version(
2599            Cursor::new(&data[1..]),
2600            data[0],
2601            current_canonical_root,
2602            Some(data.len()),
2603            1,
2604        )
2605    }
2606
2607    fn from_reader_after_version<R: Read>(
2608        reader: R,
2609        version: u8,
2610        current_canonical_root: &Path,
2611        total_len: Option<usize>,
2612        bytes_read: usize,
2613    ) -> Result<Self, String> {
2614        debug_assert!(current_canonical_root.is_absolute());
2615        let mut reader = CountingReader::with_bytes_read(reader, bytes_read);
2616
2617        if version != SEMANTIC_INDEX_VERSION_V1
2618            && version != SEMANTIC_INDEX_VERSION_V2
2619            && version != SEMANTIC_INDEX_VERSION_V3
2620            && version != SEMANTIC_INDEX_VERSION_V4
2621            && version != SEMANTIC_INDEX_VERSION_V5
2622            && version != SEMANTIC_INDEX_VERSION_V6
2623        {
2624            return Err(format!("unsupported version: {}", version));
2625        }
2626        // V2 and newer share the same header layout (V3/V4/V5 only differ from
2627        // V2 in the per-mtime entry layout): version(1) + dimension(4) +
2628        // entry_count(4) + fingerprint_len(4) + fingerprint bytes.
2629        if (version == SEMANTIC_INDEX_VERSION_V2
2630            || version == SEMANTIC_INDEX_VERSION_V3
2631            || version == SEMANTIC_INDEX_VERSION_V4
2632            || version == SEMANTIC_INDEX_VERSION_V5
2633            || version == SEMANTIC_INDEX_VERSION_V6)
2634            && total_len.is_some_and(|len| len < HEADER_BYTES_V2)
2635        {
2636            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2637        }
2638
2639        let dimension = read_u32_stream(&mut reader)? as usize;
2640        let entry_count = read_u32_stream(&mut reader)? as usize;
2641        validate_embedding_dimension(dimension)?;
2642        if entry_count > MAX_ENTRIES {
2643            return Err(format!("too many semantic index entries: {}", entry_count));
2644        }
2645
2646        // Fingerprint handling:
2647        //   - V1: no fingerprint field at all.
2648        //   - V2: fingerprint_len + fingerprint bytes; always present (writer
2649        //     only emitted V2 when fingerprint was Some).
2650        //   - V3+: fingerprint_len always present; fingerprint_len==0 ⇒ None.
2651        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2652            || version == SEMANTIC_INDEX_VERSION_V3
2653            || version == SEMANTIC_INDEX_VERSION_V4
2654            || version == SEMANTIC_INDEX_VERSION_V5
2655            || version == SEMANTIC_INDEX_VERSION_V6;
2656        let fingerprint = if has_fingerprint_field {
2657            let fingerprint_len = read_u32_stream(&mut reader)? as usize;
2658            if total_len
2659                .is_some_and(|len| reader.bytes_read().saturating_add(fingerprint_len) > len)
2660            {
2661                return Err("unexpected end of data reading fingerprint".to_string());
2662            }
2663            if fingerprint_len == 0 {
2664                None
2665            } else {
2666                let mut raw = vec![0u8; fingerprint_len];
2667                read_exact_stream(
2668                    &mut reader,
2669                    &mut raw,
2670                    "unexpected end of data reading fingerprint",
2671                )?;
2672                let raw = String::from_utf8_lossy(&raw).to_string();
2673                Some(
2674                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2675                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2676                )
2677            }
2678        } else {
2679            None
2680        };
2681
2682        // File mtimes
2683        let mtime_count = read_u32_stream(&mut reader)? as usize;
2684        if mtime_count > MAX_ENTRIES {
2685            return Err(format!("too many semantic file mtimes: {}", mtime_count));
2686        }
2687
2688        let vector_bytes = entry_count
2689            .checked_mul(dimension)
2690            .and_then(|count| count.checked_mul(F32_BYTES))
2691            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2692        if total_len.is_some_and(|len| vector_bytes > len.saturating_sub(reader.bytes_read())) {
2693            return Err("semantic index vectors exceed available data".to_string());
2694        }
2695
2696        let mut file_mtimes = HashMap::with_capacity(mtime_count);
2697        let mut file_sizes = HashMap::with_capacity(mtime_count);
2698        let mut file_hashes = HashMap::with_capacity(mtime_count);
2699        for _ in 0..mtime_count {
2700            let path = read_string_stream(&mut reader, total_len)?;
2701            let secs = read_u64_stream(&mut reader)?;
2702            // V3+ persists subsec_nanos alongside secs so staleness checks
2703            // survive restart round-trips. V1/V2 load with 0 nanos, which
2704            // causes one rebuild on upgrade (they never matched live APFS
2705            // mtimes anyway — the bug v0.15.2 fixes). After that rebuild,
2706            // the cache is persisted as V3 and stabilises.
2707            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2708                || version == SEMANTIC_INDEX_VERSION_V4
2709                || version == SEMANTIC_INDEX_VERSION_V5
2710                || version == SEMANTIC_INDEX_VERSION_V6
2711            {
2712                read_u32_stream(&mut reader)?
2713            } else {
2714                0
2715            };
2716            let size =
2717                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2718                    read_u64_stream(&mut reader)?
2719                } else {
2720                    0
2721                };
2722            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2723                let mut hash_bytes = [0u8; 32];
2724                read_exact_stream(
2725                    &mut reader,
2726                    &mut hash_bytes,
2727                    "unexpected end of data reading content hash",
2728                )?;
2729                blake3::Hash::from_bytes(hash_bytes)
2730            } else {
2731                cache_freshness::zero_hash()
2732            };
2733            // Hardening against corrupt / maliciously crafted cache files
2734            // (v0.15.2). `Duration::new(secs, nanos)` can panic when the
2735            // nanosecond carry overflows the second counter, and
2736            // `SystemTime + Duration` can panic on carry past the platform's
2737            // upper bound. Explicit validation keeps a corrupted semantic.bin
2738            // from taking down the whole aft process.
2739            if nanos >= 1_000_000_000 {
2740                return Err(format!(
2741                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
2742                    nanos
2743                ));
2744            }
2745            let duration = std::time::Duration::new(secs, nanos);
2746            let mtime = SystemTime::UNIX_EPOCH
2747                .checked_add(duration)
2748                .ok_or_else(|| {
2749                    format!(
2750                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2751                        secs, nanos
2752                    )
2753                })?;
2754            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2755                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2756                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2757            } else {
2758                PathBuf::from(path)
2759            };
2760            file_mtimes.insert(path.clone(), mtime);
2761            file_sizes.insert(path.clone(), size);
2762            file_hashes.insert(path, content_hash);
2763        }
2764
2765        // Entries
2766        let mut entries = Vec::with_capacity(entry_count);
2767        for _ in 0..entry_count {
2768            let raw_file = PathBuf::from(read_string_stream(&mut reader, total_len)?);
2769            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2770                cached_path_under_root(current_canonical_root, &raw_file)
2771                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2772            } else {
2773                raw_file
2774            };
2775            let name = read_string_stream(&mut reader, total_len)?;
2776
2777            let kind = u8_to_symbol_kind(read_u8_stream(&mut reader, "unexpected end of data")?);
2778
2779            let start_line = read_u32_stream(&mut reader)?;
2780            let end_line = read_u32_stream(&mut reader)?;
2781
2782            let exported = read_u8_stream(&mut reader, "unexpected end of data")? != 0;
2783
2784            let snippet = read_string_stream(&mut reader, total_len)?;
2785            let embed_text = read_string_stream(&mut reader, total_len)?;
2786
2787            // Vector
2788            let vec_bytes = dimension
2789                .checked_mul(F32_BYTES)
2790                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2791            if total_len.is_some_and(|len| reader.bytes_read().saturating_add(vec_bytes) > len) {
2792                return Err("unexpected end of data reading vector".to_string());
2793            }
2794            let mut vector = Vec::with_capacity(dimension);
2795            for _ in 0..dimension {
2796                let mut bytes = [0u8; F32_BYTES];
2797                read_exact_stream(
2798                    &mut reader,
2799                    &mut bytes,
2800                    "unexpected end of data reading vector",
2801                )?;
2802                vector.push(f32::from_le_bytes(bytes));
2803            }
2804
2805            entries.push(EmbeddingEntry {
2806                chunk: SemanticChunk {
2807                    file,
2808                    name,
2809                    kind,
2810                    start_line,
2811                    end_line,
2812                    exported,
2813                    embed_text,
2814                    snippet,
2815                },
2816                vector,
2817            });
2818        }
2819
2820        if entries.len() != entry_count {
2821            return Err(format!(
2822                "semantic cache entry count drift: header={} decoded={}",
2823                entry_count,
2824                entries.len()
2825            ));
2826        }
2827        for entry in &entries {
2828            if !file_mtimes.contains_key(&entry.chunk.file) {
2829                return Err(format!(
2830                    "semantic cache metadata missing for entry file {}",
2831                    entry.chunk.file.display()
2832                ));
2833            }
2834        }
2835
2836        Ok(Self {
2837            entries,
2838            file_mtimes,
2839            file_sizes,
2840            file_hashes,
2841            dimension,
2842            fingerprint,
2843            project_root: current_canonical_root.to_path_buf(),
2844            deferred_files: HashSet::new(),
2845        })
2846    }
2847}
2848
/// Write all of `bytes` to `writer` and, on success, add their length to
/// `bytes_written` (saturating at `usize::MAX`). The counter is left
/// untouched when the write fails.
fn write_counted<W: Write>(
    writer: &mut W,
    bytes: &[u8],
    bytes_written: &mut usize,
) -> io::Result<()> {
    writer.write_all(bytes).map(|()| {
        *bytes_written = bytes_written.saturating_add(bytes.len());
    })
}
2858
/// `Read` adapter that keeps a running total of the bytes handed out by the
/// wrapped reader.
struct CountingReader<R> {
    /// The wrapped reader.
    inner: R,
    /// Bytes consumed so far, including the initial offset given at construction.
    bytes_read: usize,
}

impl<R> CountingReader<R> {
    /// Wrap `inner`, starting the counter at `bytes_read`.
    fn with_bytes_read(inner: R, bytes_read: usize) -> Self {
        CountingReader { inner, bytes_read }
    }

    /// Current value of the byte counter.
    fn bytes_read(&self) -> usize {
        self.bytes_read
    }
}

impl<R: Read> Read for CountingReader<R> {
    /// Forward to the inner reader and add the number of bytes it returned to
    /// the counter (saturating). Errors are passed through without counting.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        match self.inner.read(buf) {
            Ok(count) => {
                self.bytes_read = self.bytes_read.saturating_add(count);
                Ok(count)
            }
            Err(error) => Err(error),
        }
    }
}
2881
2882fn read_exact_stream<R: Read>(
2883    reader: &mut CountingReader<R>,
2884    buf: &mut [u8],
2885    eof_message: &'static str,
2886) -> Result<(), String> {
2887    reader.read_exact(buf).map_err(|error| {
2888        if error.kind() == io::ErrorKind::UnexpectedEof {
2889            eof_message.to_string()
2890        } else {
2891            format!("{eof_message}: {error}")
2892        }
2893    })
2894}
2895
2896fn read_u8_stream<R: Read>(
2897    reader: &mut CountingReader<R>,
2898    eof_message: &'static str,
2899) -> Result<u8, String> {
2900    let mut bytes = [0u8; 1];
2901    read_exact_stream(reader, &mut bytes, eof_message)?;
2902    Ok(bytes[0])
2903}
2904
2905fn read_u32_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u32, String> {
2906    let mut bytes = [0u8; 4];
2907    read_exact_stream(reader, &mut bytes, "unexpected end of data reading u32")?;
2908    Ok(u32::from_le_bytes(bytes))
2909}
2910
2911fn read_u64_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u64, String> {
2912    let mut bytes = [0u8; 8];
2913    read_exact_stream(reader, &mut bytes, "unexpected end of data reading u64")?;
2914    Ok(u64::from_le_bytes(bytes))
2915}
2916
2917fn read_string_stream<R: Read>(
2918    reader: &mut CountingReader<R>,
2919    total_len: Option<usize>,
2920) -> Result<String, String> {
2921    let len = read_u32_stream(reader)? as usize;
2922    if total_len.is_some_and(|total_len| reader.bytes_read().saturating_add(len) > total_len) {
2923        return Err("unexpected end of data reading string".to_string());
2924    }
2925    let mut bytes = vec![0u8; len];
2926    read_exact_stream(reader, &mut bytes, "unexpected end of data reading string")?;
2927    Ok(String::from_utf8_lossy(&bytes).to_string())
2928}
2929
/// Lines of a source text together with the byte offset at which each line
/// starts in that text.
struct SourceLineCache<'a> {
    /// Line contents as produced by `str::lines` (terminators removed).
    lines: Vec<&'a str>,
    /// Byte offset of the first byte of each line; same length as `lines`.
    line_starts: Vec<usize>,
}

impl<'a> SourceLineCache<'a> {
    /// Split `source` into lines and record where each one begins.
    fn new(source: &'a str) -> Self {
        let lines: Vec<&'a str> = source.lines().collect();
        let raw = source.as_bytes();

        // Walk the text once: after each line's content, step over the
        // terminator that follows it ("\r\n", "\n", or nothing at the end).
        let mut cursor = 0usize;
        let line_starts: Vec<usize> = lines
            .iter()
            .map(|line| {
                let start = cursor;
                let after_text = start + line.len();
                let terminator_len = match (raw.get(after_text), raw.get(after_text + 1)) {
                    (Some(&b'\r'), Some(&b'\n')) => 2,
                    (Some(&b'\n'), _) => 1,
                    _ => 0,
                };
                cursor = after_text + terminator_len;
                start
            })
            .collect();

        SourceLineCache { lines, line_starts }
    }

    /// Number of lines in the cache.
    fn len(&self) -> usize {
        debug_assert_eq!(self.lines.len(), self.line_starts.len());
        self.line_starts.len()
    }
}
2958
2959/// Build enriched embedding text from a symbol with cAST-style context
2960fn build_embed_text_with_lines(
2961    symbol: &Symbol,
2962    line_cache: &SourceLineCache<'_>,
2963    file: &Path,
2964    project_root: &Path,
2965) -> String {
2966    let relative = file
2967        .strip_prefix(project_root)
2968        .unwrap_or(file)
2969        .to_string_lossy();
2970
2971    let kind_label = match &symbol.kind {
2972        SymbolKind::Function => "function",
2973        SymbolKind::Class => "class",
2974        SymbolKind::Method => "method",
2975        SymbolKind::Struct => "struct",
2976        SymbolKind::Interface => "interface",
2977        SymbolKind::Enum => "enum",
2978        SymbolKind::TypeAlias => "type",
2979        SymbolKind::Variable => "variable",
2980        SymbolKind::Heading => "heading",
2981        SymbolKind::FileSummary => "file-summary",
2982    };
2983
2984    // Build: "file:relative/path kind:function name:validateAuth signature:fn validateAuth(token: &str) -> bool"
2985    let name = &symbol.name;
2986    let mut text = format!(
2987        "name:{name} file:{} kind:{} name:{name}",
2988        relative, kind_label
2989    );
2990
2991    if let Some(sig) = &symbol.signature {
2992        // Cap the signature: structured parsers (e.g. YAML/Kubernetes) pack
2993        // entire inline scripts (CronJob/Job `command:` bodies, multi-KB) into
2994        // the signature. Appending it unbounded produces a single embed_text
2995        // that overflows the embedding backend's physical batch (e.g. a
2996        // llama.cpp server's 512-token cap), aborting the whole index build
2997        // and silently degrading every search to lexical. 400 chars keeps the
2998        // identifying head of the signature without blowing the budget.
2999        text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
3000    }
3001
3002    // Add body snippet (first ~300 chars of symbol body)
3003    let start = (symbol.range.start_line as usize).min(line_cache.len());
3004    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
3005    let end = (symbol.range.end_line as usize + 1).min(line_cache.len());
3006    if start < end {
3007        let body: String = line_cache.lines[start..end]
3008            .iter()
3009            .take(15) // max 15 lines
3010            .copied()
3011            .collect::<Vec<&str>>()
3012            .join("\n");
3013        let snippet = if body.len() > 300 {
3014            format!("{}...", &body[..body.floor_char_boundary(300)])
3015        } else {
3016            body
3017        };
3018        text.push_str(&format!(" body:{}", snippet));
3019    }
3020
3021    // Final defense-in-depth clamp: no single embed_text may exceed the
3022    // backend's per-input budget regardless of which field grew. Most
3023    // backends cap a physical batch around 512 tokens; ~1600 chars stays
3024    // comfortably under that for typical English/code (≈4 chars/token).
3025    truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
3026}
3027
/// Test-only convenience: build the embed text straight from a source string
/// instead of a prepared line cache.
#[cfg(test)]
fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
    build_embed_text_with_lines(symbol, &SourceLineCache::new(source), file, project_root)
}
3033
/// Upper bound on characters in a single chunk's `embed_text`. Keeps any one
/// input below typical embedding-backend physical batch limits (~512 tokens)
/// so an oversized symbol cannot abort the whole index build.
///
/// The limit is applied through `truncate_chars`, so it counts `char`s
/// (Unicode scalar values), not bytes.
const MAX_EMBED_TEXT_CHARS: usize = 1600;
3038
/// Return the first `max_chars` characters of `value` as an owned string;
/// shorter inputs are returned whole.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte offset of the char at index `max_chars` is exactly where the
    // kept prefix ends; if there is no such char the whole string fits.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
3042
3043fn first_leading_doc_comment(line_cache: &SourceLineCache<'_>) -> String {
3044    let Some((start, first)) = line_cache
3045        .lines
3046        .iter()
3047        .enumerate()
3048        .find(|(_, line)| !line.trim().is_empty())
3049    else {
3050        return String::new();
3051    };
3052
3053    let trimmed = first.trim_start();
3054    if trimmed.starts_with("/**") {
3055        let mut comment = Vec::new();
3056        for line in line_cache.lines.iter().skip(start) {
3057            comment.push(*line);
3058            if line.contains("*/") {
3059                break;
3060            }
3061        }
3062        return truncate_chars(&comment.join("\n"), 200);
3063    }
3064
3065    if trimmed.starts_with("///") || trimmed.starts_with("//!") {
3066        let comment = line_cache
3067            .lines
3068            .iter()
3069            .skip(start)
3070            .take_while(|line| {
3071                let trimmed = line.trim_start();
3072                trimmed.starts_with("///") || trimmed.starts_with("//!")
3073            })
3074            .copied()
3075            .collect::<Vec<_>>()
3076            .join("\n");
3077        return truncate_chars(&comment, 200);
3078    }
3079
3080    String::new()
3081}
3082
3083pub fn build_file_summary_chunk(
3084    file: &Path,
3085    project_root: &Path,
3086    source: &str,
3087    top_exports: &[&str],
3088    top_export_signatures: &[Option<&str>],
3089) -> SemanticChunk {
3090    let line_cache = SourceLineCache::new(source);
3091    build_file_summary_chunk_with_lines(
3092        file,
3093        project_root,
3094        &line_cache,
3095        top_exports,
3096        top_export_signatures,
3097    )
3098}
3099
3100fn build_file_summary_chunk_with_lines(
3101    file: &Path,
3102    project_root: &Path,
3103    line_cache: &SourceLineCache<'_>,
3104    top_exports: &[&str],
3105    top_export_signatures: &[Option<&str>],
3106) -> SemanticChunk {
3107    let relative = file.strip_prefix(project_root).unwrap_or(file);
3108    let rel_path = relative.to_string_lossy();
3109    let parent_dir = relative
3110        .parent()
3111        .map(|parent| parent.to_string_lossy().to_string())
3112        .unwrap_or_default();
3113    let name = file
3114        .file_stem()
3115        .map(|stem| stem.to_string_lossy().to_string())
3116        .unwrap_or_default();
3117    let doc = first_leading_doc_comment(line_cache);
3118    let exports = top_exports
3119        .iter()
3120        .take(5)
3121        .copied()
3122        .collect::<Vec<_>>()
3123        .join(",");
3124    let snippet = if doc.is_empty() {
3125        top_export_signatures
3126            .first()
3127            .and_then(|signature| signature.as_deref())
3128            .map(|signature| truncate_chars(signature, 200))
3129            .unwrap_or_default()
3130    } else {
3131        doc.clone()
3132    };
3133
3134    SemanticChunk {
3135        file: file.to_path_buf(),
3136        name,
3137        kind: SymbolKind::FileSummary,
3138        start_line: 0,
3139        end_line: 0,
3140        exported: false,
3141        embed_text: truncate_chars(
3142            &format!(
3143                "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
3144                file.file_stem()
3145                    .map(|stem| stem.to_string_lossy().to_string())
3146                    .unwrap_or_default()
3147            ),
3148            MAX_EMBED_TEXT_CHARS,
3149        ),
3150        snippet,
3151    }
3152}
3153
3154fn parser_for(
3155    parsers: &mut HashMap<crate::parser::LangId, Parser>,
3156    lang: crate::parser::LangId,
3157) -> Result<&mut Parser, String> {
3158    use std::collections::hash_map::Entry;
3159
3160    match parsers.entry(lang) {
3161        Entry::Occupied(entry) => Ok(entry.into_mut()),
3162        Entry::Vacant(entry) => {
3163            let grammar = grammar_for(lang);
3164            let mut parser = Parser::new();
3165            parser
3166                .set_language(&grammar)
3167                .map_err(|error| error.to_string())?;
3168            Ok(entry.insert(parser))
3169        }
3170    }
3171}
3172
/// Reports whether `path` has a file extension that semantic indexing accepts.
///
/// The comparison is exact and case-sensitive (note that both `r` and `R` are
/// listed). Paths with no extension, or with a non-UTF-8 extension, are
/// rejected.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
        "pas", "pp", "dpr", "dpk", "lpr", "java", "kt", "kts", "rb", "swift", "scala", "sc",
        "lua", "pl", "pm", "t", "r", "R",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
3223
/// Canonicalizes `path`, tolerating a final component that no longer exists.
///
/// Resolution order:
/// 1. If `path` itself can be canonicalized, that result is returned.
/// 2. Otherwise the parent directory is canonicalized and the original file
///    name is appended, which covers files that were deleted from a directory
///    that still exists.
/// 3. If the path has no parent or file name, or the parent cannot be
///    canonicalized either, `path` is returned unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    if let Ok(canonical) = fs::canonicalize(path) {
        return canonical;
    }

    let (Some(parent), Some(file_name)) = (path.parent(), path.file_name()) else {
        return path.to_path_buf();
    };

    // A bare relative name such as `foo.rs` has an empty parent, which
    // `fs::canonicalize` rejects. Resolve it against the current directory,
    // the same base an existing relative path is resolved against in step 1.
    let parent = if parent.as_os_str().is_empty() {
        Path::new(".")
    } else {
        parent
    };

    fs::canonicalize(parent)
        .map(|canonical_parent| canonical_parent.join(file_name))
        .unwrap_or_else(|_| path.to_path_buf())
}
3240
/// Upper bound, in bytes, on the size of a file that semantic chunking reads.
///
/// Files larger than this are skipped for semantic chunking. The read +
/// tree-sitter parse is transiently O(file size) (tree-sitter can use several×
/// the source bytes), and `par_iter` collection parses many files at once, so an
/// unbounded read here is an OOM vector on a repo with a few multi-MB generated/
/// vendored/minified files. A file this large yields almost no useful embedding
/// anyway (each chunk's embed_text is clamped to MAX_EMBED_TEXT_CHARS), so we
/// track it (0 chunks) instead of reading it — freshness then skips it on later
/// refreshes. 4 MiB keeps essentially all hand-written source while capping the
/// pathological tail.
const MAX_SEMANTIC_FILE_BYTES: u64 = 4 * 1024 * 1024;
3251
3252fn collect_semantic_file(
3253    project_root: &Path,
3254    file: &Path,
3255    parsers: &mut HashMap<crate::parser::LangId, Parser>,
3256) -> Result<(IndexedFileMetadata, Vec<SemanticChunk>), String> {
3257    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
3258    if !metadata.is_file() {
3259        return Err("not a regular file".to_string());
3260    }
3261    let mtime = metadata.modified().map_err(|error| error.to_string())?;
3262    let size = metadata.len();
3263
3264    if !is_semantic_indexed_extension(file) {
3265        return Err("unsupported file extension".to_string());
3266    }
3267    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
3268
3269    let mut indexed_metadata = IndexedFileMetadata {
3270        mtime,
3271        size,
3272        content_hash: cache_freshness::zero_hash(),
3273    };
3274
3275    // OOM backstop: skip oversized files before the read + parse (tracked with
3276    // zero chunks by the caller, so freshness won't re-read them every refresh).
3277    if size > MAX_SEMANTIC_FILE_BYTES {
3278        return Ok((indexed_metadata, Vec::new()));
3279    }
3280
3281    let source = fs::read_to_string(file).map_err(|error| error.to_string())?;
3282    indexed_metadata.content_hash = if size <= cache_freshness::CONTENT_HASH_SIZE_CAP {
3283        cache_freshness::hash_bytes(source.as_bytes())
3284    } else {
3285        cache_freshness::zero_hash()
3286    };
3287
3288    let chunks = collect_file_chunks_from_source(project_root, file, lang, parsers, &source)?;
3289    Ok((indexed_metadata, chunks))
3290}
3291
/// Test-only helper: reads `file` and returns just its semantic chunks.
///
/// Files larger than `MAX_SEMANTIC_FILE_BYTES` yield an empty list without
/// being read. A metadata failure does not short-circuit here; the subsequent
/// read reports the error instead.
#[cfg(test)]
fn collect_file_chunks(
    project_root: &Path,
    file: &Path,
    parsers: &mut HashMap<crate::parser::LangId, Parser>,
) -> Result<Vec<SemanticChunk>, String> {
    let unsupported = || "unsupported file extension".to_string();
    if !is_semantic_indexed_extension(file) {
        return Err(unsupported());
    }
    let lang = detect_language(file).ok_or_else(unsupported)?;

    // OOM backstop: oversized files are skipped before the read + parse.
    let oversized = matches!(
        fs::metadata(file),
        Ok(meta) if meta.len() > MAX_SEMANTIC_FILE_BYTES
    );
    if oversized {
        return Ok(Vec::new());
    }

    let source = fs::read_to_string(file).map_err(|err| err.to_string())?;
    collect_file_chunks_from_source(project_root, file, lang, parsers, &source)
}
3310
3311fn collect_file_chunks_from_source(
3312    project_root: &Path,
3313    file: &Path,
3314    lang: crate::parser::LangId,
3315    parsers: &mut HashMap<crate::parser::LangId, Parser>,
3316    source: &str,
3317) -> Result<Vec<SemanticChunk>, String> {
3318    let tree = parser_for(parsers, lang)?
3319        .parse(source, None)
3320        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
3321    let symbols =
3322        extract_symbols_from_tree(source, &tree, lang).map_err(|error| error.to_string())?;
3323
3324    Ok(symbols_to_chunks(file, &symbols, source, project_root))
3325}
3326
3327/// Build a display snippet from a symbol's source
3328fn build_snippet_with_lines(symbol: &Symbol, line_cache: &SourceLineCache<'_>) -> String {
3329    let start = (symbol.range.start_line as usize).min(line_cache.len());
3330    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
3331    let end = (symbol.range.end_line as usize + 1).min(line_cache.len());
3332    if start < end {
3333        let snippet_lines: Vec<&str> = line_cache.lines[start..end]
3334            .iter()
3335            .take(5)
3336            .copied()
3337            .collect();
3338        let mut snippet = snippet_lines.join("\n");
3339        if end - start > 5 {
3340            snippet.push_str("\n  ...");
3341        }
3342        if snippet.len() > 300 {
3343            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
3344        }
3345        snippet
3346    } else {
3347        String::new()
3348    }
3349}
3350
/// Test-only convenience wrapper: builds a symbol's snippet straight from raw
/// `source` text instead of a prepared line cache.
#[cfg(test)]
fn build_snippet(symbol: &Symbol, source: &str) -> String {
    build_snippet_with_lines(symbol, &SourceLineCache::new(source))
}
3356
3357/// Convert symbols to semantic chunks with enriched context
3358fn symbols_to_chunks(
3359    file: &Path,
3360    symbols: &[Symbol],
3361    source: &str,
3362    project_root: &Path,
3363) -> Vec<SemanticChunk> {
3364    let line_cache = SourceLineCache::new(source);
3365    let mut chunks = Vec::new();
3366    let top_exports_with_signatures = symbols
3367        .iter()
3368        .filter(|symbol| {
3369            symbol.exported
3370                && symbol.parent.is_none()
3371                && !matches!(symbol.kind, SymbolKind::Heading)
3372        })
3373        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
3374        .collect::<Vec<_>>();
3375
3376    let has_only_headings = !symbols.is_empty()
3377        && symbols
3378            .iter()
3379            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
3380    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
3381        let top_exports = top_exports_with_signatures
3382            .iter()
3383            .map(|(name, _)| *name)
3384            .collect::<Vec<_>>();
3385        let top_export_signatures = top_exports_with_signatures
3386            .iter()
3387            .map(|(_, signature)| *signature)
3388            .collect::<Vec<_>>();
3389        chunks.push(build_file_summary_chunk_with_lines(
3390            file,
3391            project_root,
3392            &line_cache,
3393            &top_exports,
3394            &top_export_signatures,
3395        ));
3396    }
3397
3398    for symbol in symbols {
3399        // Skip Markdown / HTML heading chunks: empirically they dominate result
3400        // lists even for code-shaped queries because heading prose embeds well.
3401        // Agents querying for code lose the actual matches under doc noise.
3402        // README/docs queries are still served by grep on the same files.
3403        if matches!(symbol.kind, SymbolKind::Heading) {
3404            continue;
3405        }
3406
3407        // Skip very small symbols (single-line variables, etc.)
3408        let line_count = symbol
3409            .range
3410            .end_line
3411            .saturating_sub(symbol.range.start_line)
3412            + 1;
3413        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
3414            continue;
3415        }
3416
3417        let embed_text = build_embed_text_with_lines(symbol, &line_cache, file, project_root);
3418        let snippet = build_snippet_with_lines(symbol, &line_cache);
3419
3420        chunks.push(SemanticChunk {
3421            file: file.to_path_buf(),
3422            name: symbol.name.clone(),
3423            kind: symbol.kind.clone(),
3424            start_line: symbol.range.start_line,
3425            end_line: symbol.range.end_line,
3426            exported: symbol.exported,
3427            embed_text,
3428            snippet,
3429        });
3430
3431        // Note: Nested symbols are handled separately by the outline system
3432        // Each symbol is indexed individually
3433    }
3434
3435    chunks
3436}
3437
/// Orders `(score, index)` pairs for ranking: highest score first, ties broken
/// by ascending index.
///
/// NaN scores sort after every real score (and tie with each other on score),
/// so the comparison is a total order. Treating NaN as equal to everything
/// would not be: `1.0 == NaN` and `NaN == 2.0` but `1.0 != 2.0`, and the
/// standard sort routines may panic or misorder when handed an inconsistent
/// comparator. Comparisons between non-NaN scores are unaffected.
fn semantic_score_order(a: &(f32, usize), b: &(f32, usize)) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    let by_score = match (a.0.is_nan(), b.0.is_nan()) {
        // Descending by score; `partial_cmp` cannot fail without a NaN operand.
        (false, false) => b.0.partial_cmp(&a.0).unwrap_or(Ordering::Equal),
        (true, true) => Ordering::Equal,
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
    };
    by_score.then_with(|| a.1.cmp(&b.1))
}
3443
/// Cosine similarity between two vectors.
///
/// Returns `0.0` when the lengths differ or when either vector has zero
/// magnitude (which includes empty input), so the result is never a division
/// by zero.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Single pass accumulating the dot product and both squared norms.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
3467
3468// Serialization helpers
3469fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
3470    match kind {
3471        SymbolKind::Function => 0,
3472        SymbolKind::Class => 1,
3473        SymbolKind::Method => 2,
3474        SymbolKind::Struct => 3,
3475        SymbolKind::Interface => 4,
3476        SymbolKind::Enum => 5,
3477        SymbolKind::TypeAlias => 6,
3478        SymbolKind::Variable => 7,
3479        SymbolKind::Heading => 8,
3480        SymbolKind::FileSummary => 9,
3481    }
3482}
3483
3484fn u8_to_symbol_kind(v: u8) -> SymbolKind {
3485    match v {
3486        0 => SymbolKind::Function,
3487        1 => SymbolKind::Class,
3488        2 => SymbolKind::Method,
3489        3 => SymbolKind::Struct,
3490        4 => SymbolKind::Interface,
3491        5 => SymbolKind::Enum,
3492        6 => SymbolKind::TypeAlias,
3493        7 => SymbolKind::Variable,
3494        8 => SymbolKind::Heading,
3495        9 => SymbolKind::FileSummary,
3496        _ => SymbolKind::Heading,
3497    }
3498}
3499
3500#[cfg(test)]
3501mod tests {
3502    use super::*;
3503    use crate::config::{SemanticBackend, SemanticBackendConfig};
3504    use crate::parser::FileParser;
3505    use std::io::{Read, Write};
3506    use std::net::TcpListener;
3507    use std::thread;
3508
3509    #[test]
3510    fn semantic_index_includes_php_inc_and_scss_extensions() {
3511        for file in ["partial.inc", "index.php", "styles.scss"] {
3512            assert!(
3513                is_semantic_indexed_extension(Path::new(file)),
3514                "{file} should be semantic-index eligible"
3515            );
3516        }
3517    }
3518
3519    #[test]
3520    fn transient_marker_round_trips_and_classifies() {
3521        // A marked transient error is recognized and the marker is stripped for
3522        // display, leaving a clean message.
3523        let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
3524        assert!(embedding_failure_is_transient(&marked));
3525        let clean = strip_transient_embedding_marker(&marked);
3526        assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
3527        assert!(clean.starts_with("openai compatible request failed:"));
3528
3529        // Permanent errors (HTTP 4xx, dimension mismatch) carry no marker and
3530        // are not classified transient — they must fail fast.
3531        for permanent in [
3532            "openai compatible request failed (HTTP 401): Unauthorized",
3533            "embedding dimension mismatch: index has 384, model returned 768",
3534            "too many files (>20000) for semantic indexing (max 20000)",
3535        ] {
3536            assert!(
3537                !embedding_failure_is_transient(permanent),
3538                "{permanent:?} must not be transient"
3539            );
3540            // Stripping a marker-free string is a no-op.
3541            assert_eq!(strip_transient_embedding_marker(permanent), permanent);
3542        }
3543    }
3544
3545    #[test]
3546    fn send_error_transience_separates_connect_timeout_from_4xx() {
3547        // 5xx / 429 are transient; other client errors are not.
3548        assert!(is_retryable_embedding_status(
3549            reqwest::StatusCode::INTERNAL_SERVER_ERROR
3550        ));
3551        assert!(is_retryable_embedding_status(
3552            reqwest::StatusCode::TOO_MANY_REQUESTS
3553        ));
3554        assert!(!is_retryable_embedding_status(
3555            reqwest::StatusCode::UNAUTHORIZED
3556        ));
3557        assert!(!is_retryable_embedding_status(
3558            reqwest::StatusCode::BAD_REQUEST
3559        ));
3560    }
3561
3562    #[test]
3563    fn local_backend_model_loading_body_is_transient() {
3564        // LM Studio / Ollama return a 4xx with a loading/unloaded message while
3565        // the model swaps; these must classify transient so the build self-heals.
3566        for body in [
3567            r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
3568            r#"{"error":"model is loading, please wait"}"#,
3569            r#"{"error":"Model not loaded"}"#,
3570            "Loading model into memory",
3571        ] {
3572            assert!(
3573                embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3574                "{body:?} should be body-transient"
3575            );
3576        }
3577
3578        // A genuine 4xx misconfiguration body must NOT be treated as transient,
3579        // even when it happens to contain generic words from the old broad
3580        // substring matcher.
3581        for body in [
3582            r#"{"error":"invalid api key"}"#,
3583            r#"{"error":"model 'foo' not found"}"#,
3584            "Bad Request: unknown field",
3585            "Bad Request: invalid loading model option",
3586            r#"{"error":"unauthorized while model is being loaded by another account"}"#,
3587        ] {
3588            assert!(
3589                !embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3590                "{body:?} must not be body-transient"
3591            );
3592        }
3593
3594        assert!(
3595            !embedding_response_body_is_transient(
3596                reqwest::StatusCode::UNAUTHORIZED,
3597                r#"{"error":"model is loading, please wait"}"#
3598            ),
3599            "permanent auth failures must not become transient because of body text"
3600        );
3601    }
3602
3603    fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
3604    where
3605        F: Fn(String, String, String) -> String + Send + 'static,
3606    {
3607        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3608        let addr = listener.local_addr().expect("local addr");
3609        let handle = thread::spawn(move || {
3610            let (mut stream, _) = listener.accept().expect("accept request");
3611            let mut buf = Vec::new();
3612            let mut chunk = [0u8; 4096];
3613            let mut header_end = None;
3614            let mut content_length = 0usize;
3615            loop {
3616                let n = stream.read(&mut chunk).expect("read request");
3617                if n == 0 {
3618                    break;
3619                }
3620                buf.extend_from_slice(&chunk[..n]);
3621                if header_end.is_none() {
3622                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3623                        header_end = Some(pos + 4);
3624                        let headers = String::from_utf8_lossy(&buf[..pos + 4]);
3625                        for line in headers.lines() {
3626                            if let Some(value) = line.strip_prefix("Content-Length:") {
3627                                content_length = value.trim().parse::<usize>().unwrap_or(0);
3628                            }
3629                        }
3630                    }
3631                }
3632                if let Some(end) = header_end {
3633                    if buf.len() >= end + content_length {
3634                        break;
3635                    }
3636                }
3637            }
3638
3639            let end = header_end.expect("header terminator");
3640            let request = String::from_utf8_lossy(&buf[..end]).to_string();
3641            let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3642            let mut lines = request.lines();
3643            let request_line = lines.next().expect("request line").to_string();
3644            let path = request_line
3645                .split_whitespace()
3646                .nth(1)
3647                .expect("request path")
3648                .to_string();
3649            let response_body = handler(request_line, path, body);
3650            let response = format!(
3651                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3652                response_body.len(),
3653                response_body
3654            );
3655            stream
3656                .write_all(response.as_bytes())
3657                .expect("write response");
3658        });
3659
3660        (format!("http://{}", addr), handle)
3661    }
3662
    /// Starts a mock server whose responses advertise more body than they send.
    ///
    /// The server thread accepts up to `attempts` connections, giving up once a
    /// 2-second deadline passes. For each connection it performs one read
    /// (ignoring the result) and writes a `200 OK` response that declares
    /// `Content-Length: 128` but carries only a single `{` byte of body.
    ///
    /// Returns the server's base URL and the join handle of the server thread.
    fn start_truncated_body_server(attempts: usize) -> (String, thread::JoinHandle<()>) {
        let listener = TcpListener::bind("127.0.0.1:0").expect("bind truncated test server");
        // Non-blocking accept lets the loop below poll against its deadline
        // instead of blocking forever when fewer connections arrive.
        listener
            .set_nonblocking(true)
            .expect("nonblocking listener");
        let addr = listener.local_addr().expect("local addr");
        let handle = thread::spawn(move || {
            let deadline = std::time::Instant::now() + Duration::from_secs(2);
            let mut accepted = 0usize;
            while accepted < attempts && std::time::Instant::now() < deadline {
                match listener.accept() {
                    Ok((mut stream, _)) => {
                        accepted += 1;
                        let mut buf = [0u8; 4096];
                        // The client (under test) uses a 250ms timeout and drops
                        // the connection when the truncated body never completes.
                        // On Windows that disconnect surfaces as a hard socket
                        // error (WSAECONNRESET) on these read/write calls, where
                        // Unix returns a clean EOF. Tolerate both: the mock does
                        // not need the request bytes, and a write to an
                        // already-hung-up client is expected.
                        let _ = stream.read(&mut buf);
                        let response = "HTTP/1.1 200 OK
Content-Type: application/json
Content-Length: 128
Connection: close

{";
                        let _ = stream.write_all(response.as_bytes());
                    }
                    // No pending connection yet: back off briefly and poll again.
                    Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
                        thread::sleep(Duration::from_millis(10));
                    }
                    Err(error) => panic!("accept request: {error}"),
                }
            }
        });

        (format!("http://{}", addr), handle)
    }
3703
3704    #[test]
3705    fn response_body_read_failures_are_marked_transient() {
3706        let (url, handle) = start_truncated_body_server(EMBEDDING_REQUEST_MAX_ATTEMPTS);
3707        let client = Client::builder()
3708            .timeout(Duration::from_millis(250))
3709            .build()
3710            .expect("client");
3711
3712        let error = send_embedding_request(|| client.post(&url).body("{}"), "test backend")
3713            .expect_err("truncated body should fail");
3714
3715        handle.join().unwrap();
3716        assert!(
3717            embedding_failure_is_transient(&error),
3718            "body read failures should be transient-marked: {error}"
3719        );
3720        assert!(error.contains("response read failed"));
3721    }
3722
3723    fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3724        Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
3725    }
3726
3727    fn write_rust_file(path: &Path, function_name: &str) {
3728        fs::write(
3729            path,
3730            format!("pub fn {function_name}() -> bool {{\n    true\n}}\n"),
3731        )
3732        .unwrap();
3733    }
3734
3735    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3736        let mut embed = test_vector_for_texts;
3737        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3738    }
3739
3740    fn test_project_root() -> PathBuf {
3741        std::env::current_dir().unwrap()
3742    }
3743
3744    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3745        index.file_mtimes.insert(file.to_path_buf(), mtime);
3746        index.file_sizes.insert(file.to_path_buf(), size);
3747        index
3748            .file_hashes
3749            .insert(file.to_path_buf(), cache_freshness::zero_hash());
3750    }
3751
3752    fn legacy_semantic_index_bytes(index: &SemanticIndex) -> Vec<u8> {
3753        let mut buf = Vec::new();
3754        let fingerprint_bytes = index.fingerprint.as_ref().and_then(|fingerprint| {
3755            let encoded = fingerprint.as_string();
3756            if encoded.is_empty() {
3757                None
3758            } else {
3759                Some(encoded.into_bytes())
3760            }
3761        });
3762        let file_mtimes: Vec<_> = index
3763            .file_mtimes
3764            .iter()
3765            .filter_map(|(path, mtime)| {
3766                cache_relative_path(&index.project_root, path)
3767                    .map(|relative| (relative, path, mtime))
3768            })
3769            .collect();
3770        let entries: Vec<_> = index
3771            .entries
3772            .iter()
3773            .filter_map(|entry| {
3774                cache_relative_path(&index.project_root, &entry.chunk.file)
3775                    .map(|relative| (relative, entry))
3776            })
3777            .collect();
3778
3779        buf.push(SEMANTIC_INDEX_VERSION_V6);
3780        buf.extend_from_slice(&(index.dimension as u32).to_le_bytes());
3781        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
3782        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
3783        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
3784        buf.extend_from_slice(fp_bytes_ref);
3785
3786        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
3787        for (relative, path, mtime) in &file_mtimes {
3788            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
3789            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
3790            buf.extend_from_slice(&path_bytes);
3791            let duration = mtime
3792                .duration_since(SystemTime::UNIX_EPOCH)
3793                .unwrap_or_default();
3794            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
3795            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
3796            let size = index.file_sizes.get(*path).copied().unwrap_or_default();
3797            buf.extend_from_slice(&size.to_le_bytes());
3798            let hash = index
3799                .file_hashes
3800                .get(*path)
3801                .copied()
3802                .unwrap_or_else(cache_freshness::zero_hash);
3803            buf.extend_from_slice(hash.as_bytes());
3804        }
3805
3806        for (relative, entry) in &entries {
3807            let c = &entry.chunk;
3808            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
3809            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
3810            buf.extend_from_slice(&file_bytes);
3811
3812            let name_bytes = c.name.as_bytes();
3813            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
3814            buf.extend_from_slice(name_bytes);
3815
3816            buf.push(symbol_kind_to_u8(&c.kind));
3817            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
3818            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
3819            buf.push(c.exported as u8);
3820
3821            let snippet_bytes = c.snippet.as_bytes();
3822            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
3823            buf.extend_from_slice(snippet_bytes);
3824
3825            let embed_bytes = c.embed_text.as_bytes();
3826            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
3827            buf.extend_from_slice(embed_bytes);
3828
3829            for &val in &entry.vector {
3830                buf.extend_from_slice(&val.to_le_bytes());
3831            }
3832        }
3833
3834        buf
3835    }
3836
3837    #[derive(Default)]
3838    struct RecordingEmbedder {
3839        calls: Vec<Vec<String>>,
3840    }
3841
3842    impl RecordingEmbedder {
3843        fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3844            let vectors = texts
3845                .iter()
3846                .map(|text| deterministic_test_vector(text))
3847                .collect();
3848            self.calls.push(texts);
3849            Ok(vectors)
3850        }
3851
3852        fn total_embedded_texts(&self) -> usize {
3853            self.calls.iter().map(Vec::len).sum()
3854        }
3855
3856        fn embedded_texts(&self) -> Vec<&str> {
3857            self.calls
3858                .iter()
3859                .flat_map(|batch| batch.iter().map(String::as_str))
3860                .collect()
3861        }
3862    }
3863
3864    fn deterministic_test_vector(text: &str) -> Vec<f32> {
3865        let hash = blake3::hash(text.as_bytes());
3866        let bytes = hash.as_bytes();
3867        vec![
3868            1.0,
3869            bytes[0] as f32 / 255.0,
3870            bytes[1] as f32 / 255.0,
3871            bytes[2] as f32 / 255.0,
3872        ]
3873    }
3874
3875    fn build_recorded_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3876        let mut embedder = RecordingEmbedder::default();
3877        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3878        SemanticIndex::build(project_root, files, &mut embed, 16).unwrap()
3879    }
3880
3881    fn force_stale(index: &mut SemanticIndex, file: &Path) {
3882        set_file_metadata(index, file, SystemTime::UNIX_EPOCH, 0);
3883    }
3884
3885    fn write_source(path: &Path, source: &str) {
3886        if let Some(parent) = path.parent() {
3887            fs::create_dir_all(parent).unwrap();
3888        }
3889        fs::write(path, source).unwrap();
3890    }
3891
3892    fn entries_for_file<'a>(index: &'a SemanticIndex, file: &Path) -> Vec<&'a EmbeddingEntry> {
3893        index
3894            .entries
3895            .iter()
3896            .filter(|entry| entry.chunk.file == file)
3897            .collect()
3898    }
3899
3900    fn entry_by_name<'a>(index: &'a SemanticIndex, file: &Path, name: &str) -> &'a EmbeddingEntry {
3901        index
3902            .entries
3903            .iter()
3904            .find(|entry| entry.chunk.file == file && entry.chunk.name == name)
3905            .unwrap_or_else(|| panic!("missing semantic entry {name} in {}", file.display()))
3906    }
3907
3908    fn file_summary_entry<'a>(index: &'a SemanticIndex, file: &Path) -> &'a EmbeddingEntry {
3909        index
3910            .entries
3911            .iter()
3912            .find(|entry| entry.chunk.file == file && entry.chunk.kind == SymbolKind::FileSummary)
3913            .unwrap_or_else(|| panic!("missing file-summary entry in {}", file.display()))
3914    }
3915
3916    #[test]
3917    fn refresh_stale_line_shift_reuses_all_chunks_and_retains_entries() {
3918        let temp = tempfile::tempdir().unwrap();
3919        let project_root = temp.path();
3920        let file = project_root.join("src/lib.rs");
3921        let original = "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n";
3922        write_source(&file, original);
3923
3924        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3925        let original_entry_count = index.entries.len();
3926        let original_alpha_vector = entry_by_name(&index, &file, "alpha").vector.clone();
3927
3928        write_source(&file, &format!("\n{original}"));
3929        force_stale(&mut index, &file);
3930
3931        let mut embedder = RecordingEmbedder::default();
3932        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3933        let mut progress = |_done: usize, _total: usize| {};
3934        let summary = index
3935            .refresh_stale_files(
3936                project_root,
3937                std::slice::from_ref(&file),
3938                &mut embed,
3939                16,
3940                &mut progress,
3941            )
3942            .unwrap();
3943
3944        assert_eq!(summary.changed, 1);
3945        assert_eq!(embedder.total_embedded_texts(), 0);
3946        assert_eq!(index.entries.len(), original_entry_count);
3947        let shifted_alpha = entry_by_name(&index, &file, "alpha");
3948        assert_eq!(shifted_alpha.chunk.start_line, 1);
3949        assert_eq!(shifted_alpha.vector, original_alpha_vector);
3950    }
3951
3952    #[test]
3953    fn refresh_invalidated_line_shift_emits_full_replacement_delta_for_apply() {
3954        let temp = tempfile::tempdir().unwrap();
3955        let project_root = temp.path();
3956        let file = project_root.join("src/lib.rs");
3957        let original = "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n";
3958        write_source(&file, original);
3959
3960        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3961        let mut serving_index = worker_index.clone();
3962        let original_entry_count = worker_index.entries.len();
3963
3964        write_source(&file, &format!("\n{original}"));
3965
3966        let mut embedder = RecordingEmbedder::default();
3967        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3968        let mut progress = |_done: usize, _total: usize| {};
3969        let update = worker_index
3970            .refresh_invalidated_files(
3971                project_root,
3972                std::slice::from_ref(&file),
3973                &mut embed,
3974                16,
3975                100,
3976                &mut progress,
3977            )
3978            .unwrap();
3979
3980        assert_eq!(embedder.total_embedded_texts(), 0);
3981        assert_eq!(update.added_entries.len(), original_entry_count);
3982        assert_eq!(worker_index.entries.len(), original_entry_count);
3983
3984        serving_index.apply_refresh_update(
3985            update.added_entries,
3986            update.updated_metadata,
3987            &update.completed_paths,
3988        );
3989
3990        assert_eq!(serving_index.entries.len(), original_entry_count);
3991        assert_eq!(
3992            entries_for_file(&serving_index, &file).len(),
3993            original_entry_count
3994        );
3995        assert_eq!(
3996            entry_by_name(&serving_index, &file, "alpha")
3997                .chunk
3998                .start_line,
3999            1
4000        );
4001    }
4002
4003    #[test]
4004    fn refresh_invalidated_one_symbol_edit_embeds_only_changed_symbol() {
4005        let temp = tempfile::tempdir().unwrap();
4006        let project_root = temp.path();
4007        let file = project_root.join("src/lib.rs");
4008        write_source(
4009            &file,
4010            "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n",
4011        );
4012
4013        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4014        let original_entry_count = index.entries.len();
4015        let beta_vector = entry_by_name(&index, &file, "beta").vector.clone();
4016
4017        write_source(
4018            &file,
4019            "pub fn alpha() -> i32 {\n    10\n}\n\npub fn beta() -> i32 {\n    2\n}\n",
4020        );
4021
4022        let mut embedder = RecordingEmbedder::default();
4023        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4024        let mut progress = |_done: usize, _total: usize| {};
4025        let update = index
4026            .refresh_invalidated_files(
4027                project_root,
4028                std::slice::from_ref(&file),
4029                &mut embed,
4030                16,
4031                100,
4032                &mut progress,
4033            )
4034            .unwrap();
4035
4036        assert_eq!(embedder.total_embedded_texts(), 1);
4037        assert!(embedder.embedded_texts()[0].contains("name:alpha"));
4038        assert_eq!(update.added_entries.len(), original_entry_count);
4039        assert_eq!(entry_by_name(&index, &file, "beta").vector, beta_vector);
4040    }
4041
4042    #[test]
4043    fn refresh_reuses_one_old_vector_for_two_byte_identical_symbols() {
4044        let temp = tempfile::tempdir().unwrap();
4045        let project_root = temp.path();
4046        let file = project_root.join("src/dupe.js");
4047        let one_duplicate = "function duplicate() {\n  return 1;\n}\n";
4048        write_source(&file, one_duplicate);
4049
4050        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4051        let original_vector = entry_by_name(&index, &file, "duplicate").vector.clone();
4052
4053        write_source(&file, &format!("{one_duplicate}\n{one_duplicate}"));
4054
4055        let mut embedder = RecordingEmbedder::default();
4056        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4057        let mut progress = |_done: usize, _total: usize| {};
4058        index
4059            .refresh_invalidated_files(
4060                project_root,
4061                std::slice::from_ref(&file),
4062                &mut embed,
4063                16,
4064                100,
4065                &mut progress,
4066            )
4067            .unwrap();
4068
4069        let duplicate_entries = index
4070            .entries
4071            .iter()
4072            .filter(|entry| entry.chunk.file == file && entry.chunk.name == "duplicate")
4073            .collect::<Vec<_>>();
4074        assert_eq!(duplicate_entries.len(), 2);
4075        assert_eq!(embedder.total_embedded_texts(), 0);
4076        assert_eq!(duplicate_entries[0].vector, original_vector);
4077        assert_eq!(duplicate_entries[1].vector, original_vector);
4078    }
4079
4080    #[test]
4081    fn file_summary_reuses_on_body_edit_and_misses_on_leading_doc_edit() {
4082        let temp = tempfile::tempdir().unwrap();
4083        let project_root = temp.path();
4084        let file = project_root.join("src/lib.rs");
4085        write_source(
4086            &file,
4087            "//! module docs v1\n\npub fn alpha() -> i32 {\n    1\n}\n",
4088        );
4089
4090        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4091        let summary_before = file_summary_entry(&index, &file).vector.clone();
4092
4093        write_source(
4094            &file,
4095            "//! module docs v1\n\npub fn alpha() -> i32 {\n    2\n}\n",
4096        );
4097        let mut body_embedder = RecordingEmbedder::default();
4098        let mut body_embed = |texts: Vec<String>| body_embedder.embed(texts);
4099        let mut progress = |_done: usize, _total: usize| {};
4100        index
4101            .refresh_invalidated_files(
4102                project_root,
4103                std::slice::from_ref(&file),
4104                &mut body_embed,
4105                16,
4106                100,
4107                &mut progress,
4108            )
4109            .unwrap();
4110        assert_eq!(body_embedder.total_embedded_texts(), 1);
4111        assert!(body_embedder.embedded_texts()[0].contains("name:alpha"));
4112        assert_eq!(file_summary_entry(&index, &file).vector, summary_before);
4113
4114        write_source(
4115            &file,
4116            "//! module docs v2\n\npub fn alpha() -> i32 {\n    2\n}\n",
4117        );
4118        let mut doc_embedder = RecordingEmbedder::default();
4119        let mut doc_embed = |texts: Vec<String>| doc_embedder.embed(texts);
4120        index
4121            .refresh_invalidated_files(
4122                project_root,
4123                std::slice::from_ref(&file),
4124                &mut doc_embed,
4125                16,
4126                100,
4127                &mut progress,
4128            )
4129            .unwrap();
4130
4131        assert_eq!(doc_embedder.total_embedded_texts(), 1);
4132        assert!(doc_embedder.embedded_texts()[0].contains("kind:file-summary"));
4133        assert_ne!(file_summary_entry(&index, &file).vector, summary_before);
4134    }
4135
4136    #[test]
4137    fn refresh_invalidated_deleted_file_drops_entries_without_embedding() {
4138        let temp = tempfile::tempdir().unwrap();
4139        let project_root = temp.path();
4140        let file = project_root.join("src/lib.rs");
4141        write_source(&file, "pub fn alpha() -> i32 {\n    1\n}\n");
4142
4143        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4144        let mut serving_index = worker_index.clone();
4145        fs::remove_file(&file).unwrap();
4146
4147        let mut embedder = RecordingEmbedder::default();
4148        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4149        let mut progress = |_done: usize, _total: usize| {};
4150        let update = worker_index
4151            .refresh_invalidated_files(
4152                project_root,
4153                std::slice::from_ref(&file),
4154                &mut embed,
4155                16,
4156                100,
4157                &mut progress,
4158            )
4159            .unwrap();
4160
4161        assert_eq!(update.summary.deleted, 1);
4162        assert_eq!(embedder.total_embedded_texts(), 0);
4163        assert!(worker_index.entries.is_empty());
4164
4165        serving_index.apply_refresh_update(
4166            update.added_entries,
4167            update.updated_metadata,
4168            &update.completed_paths,
4169        );
4170        assert!(serving_index.entries.is_empty());
4171    }
4172
4173    #[test]
4174    fn watcher_collect_failure_does_not_resurrect_stale_entries() {
4175        let temp = tempfile::tempdir().unwrap();
4176        let project_root = temp.path();
4177        let file = project_root.join("src/lib.rs");
4178        write_source(&file, "pub fn alpha() -> i32 {\n    1\n}\n");
4179
4180        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4181        let mut serving_index = worker_index.clone();
4182        fs::write(&file, [0xff, 0xfe, 0xfd]).unwrap();
4183
4184        let mut embedder = RecordingEmbedder::default();
4185        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4186        let mut progress = |_done: usize, _total: usize| {};
4187        let update = worker_index
4188            .refresh_invalidated_files(
4189                project_root,
4190                std::slice::from_ref(&file),
4191                &mut embed,
4192                16,
4193                100,
4194                &mut progress,
4195            )
4196            .unwrap();
4197
4198        assert_eq!(embedder.total_embedded_texts(), 0);
4199        assert!(update.added_entries.is_empty());
4200        assert!(worker_index.entries.is_empty());
4201        assert!(!worker_index.file_mtimes.contains_key(&file));
4202
4203        serving_index.apply_refresh_update(
4204            update.added_entries,
4205            update.updated_metadata,
4206            &update.completed_paths,
4207        );
4208        assert!(serving_index.entries.is_empty());
4209        assert!(!serving_index.file_mtimes.contains_key(&file));
4210    }
4211
4212    #[test]
4213    fn refresh_invalidated_cap_deferral_remains_file_count_based() {
4214        let temp = tempfile::tempdir().unwrap();
4215        let project_root = temp.path();
4216        let indexed = project_root.join("src/a.rs");
4217        let deferred = project_root.join("src/b.rs");
4218        write_source(&indexed, "pub fn alpha() -> i32 {\n    1\n}\n");
4219        write_source(&deferred, "pub fn beta() -> i32 {\n    2\n}\n");
4220
4221        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&indexed));
4222        let mut embedder = RecordingEmbedder::default();
4223        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4224        let mut progress = |_done: usize, _total: usize| {};
4225        let update = index
4226            .refresh_invalidated_files(
4227                project_root,
4228                std::slice::from_ref(&deferred),
4229                &mut embed,
4230                16,
4231                1,
4232                &mut progress,
4233            )
4234            .unwrap();
4235
4236        assert_eq!(update.summary.total_processed, 1);
4237        assert_eq!(update.summary.added, 0);
4238        assert_eq!(embedder.total_embedded_texts(), 0);
4239        assert_eq!(index.indexed_file_count(), 1);
4240        assert!(index.deferred_files.contains(&deferred));
4241        assert!(entries_for_file(&index, &deferred).is_empty());
4242    }
4243
4244    #[test]
4245    fn semantic_cache_serialization_skips_paths_outside_project_root() {
4246        let dir = tempfile::tempdir().expect("create temp dir");
4247        let project = fs::canonicalize(dir.path()).expect("canonical project");
4248        let outside = project.join("..").join("outside.rs");
4249        let mut index = SemanticIndex::new(project.clone(), 3);
4250        index
4251            .file_mtimes
4252            .insert(outside.clone(), SystemTime::UNIX_EPOCH);
4253        index.file_sizes.insert(outside.clone(), 1);
4254        index
4255            .file_hashes
4256            .insert(outside.clone(), cache_freshness::zero_hash());
4257        index.entries.push(EmbeddingEntry {
4258            chunk: SemanticChunk {
4259                file: outside,
4260                name: "outside".to_string(),
4261                kind: SymbolKind::Function,
4262                start_line: 0,
4263                end_line: 0,
4264                exported: false,
4265                embed_text: "outside".to_string(),
4266                snippet: "outside".to_string(),
4267            },
4268            vector: vec![1.0, 0.0, 0.0],
4269        });
4270
4271        let bytes = index.to_bytes();
4272        let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
4273        assert_eq!(loaded.entries.len(), 0);
4274        assert!(loaded.file_mtimes.is_empty());
4275    }
4276
4277    #[test]
4278    fn semantic_search_bounded_top_k_matches_reference_full_sort() {
4279        let project_root = test_project_root();
4280        let file = project_root.join("src/lib.rs");
4281        let mut index = SemanticIndex::new(project_root, 2);
4282        let entries = [
4283            ("alpha", vec![1.0, 0.0], false),
4284            ("beta", vec![0.0, 1.0], false),
4285            ("gamma", vec![1.0, 0.0], false),
4286            ("delta", vec![0.5, 0.5], true),
4287            ("epsilon", vec![-1.0, 0.0], false),
4288        ];
4289        for (line, (name, vector, exported)) in entries.into_iter().enumerate() {
4290            index.entries.push(EmbeddingEntry {
4291                chunk: SemanticChunk {
4292                    file: file.clone(),
4293                    name: name.to_string(),
4294                    kind: SymbolKind::Function,
4295                    start_line: line as u32 + 1,
4296                    end_line: line as u32 + 1,
4297                    exported,
4298                    embed_text: name.to_string(),
4299                    snippet: format!("fn {name}() {{}}"),
4300                },
4301                vector,
4302            });
4303        }
4304
4305        let query = vec![1.0, 0.0];
4306        let top_k = 4;
4307        let mut reference: Vec<(f32, usize)> = index
4308            .entries
4309            .iter()
4310            .enumerate()
4311            .map(|(idx, entry)| {
4312                let mut score = cosine_similarity(&query, &entry.vector);
4313                if entry.chunk.exported {
4314                    score *= 1.1;
4315                }
4316                (score, idx)
4317            })
4318            .collect();
4319        reference.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
4320        let expected: Vec<(String, f32)> = reference
4321            .into_iter()
4322            .take(top_k)
4323            .map(|(score, idx)| (index.entries[idx].chunk.name.clone(), score))
4324            .collect();
4325
4326        let actual: Vec<(String, f32)> = index
4327            .search(&query, top_k)
4328            .into_iter()
4329            .map(|result| (result.name, result.score))
4330            .collect();
4331
4332        assert_eq!(
4333            actual.iter().map(|(name, _)| name).collect::<Vec<_>>(),
4334            expected.iter().map(|(name, _)| name).collect::<Vec<_>>()
4335        );
4336        for ((_, actual_score), (_, expected_score)) in actual.iter().zip(expected.iter()) {
4337            assert!((actual_score - expected_score).abs() < 1e-6);
4338        }
4339        assert_eq!(actual[0].0, "alpha");
4340        assert_eq!(actual[1].0, "gamma", "equal scores keep insertion order");
4341        assert!(index.search(&query, 0).is_empty());
4342    }
4343
4344    #[test]
4345    fn test_cosine_similarity_identical() {
4346        let a = vec![1.0, 0.0, 0.0];
4347        let b = vec![1.0, 0.0, 0.0];
4348        assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
4349    }
4350
4351    #[test]
4352    fn test_cosine_similarity_orthogonal() {
4353        let a = vec![1.0, 0.0, 0.0];
4354        let b = vec![0.0, 1.0, 0.0];
4355        assert!(cosine_similarity(&a, &b).abs() < 0.001);
4356    }
4357
4358    #[test]
4359    fn test_cosine_similarity_opposite() {
4360        let a = vec![1.0, 0.0, 0.0];
4361        let b = vec![-1.0, 0.0, 0.0];
4362        assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
4363    }
4364
4365    #[test]
4366    fn test_serialization_roundtrip() {
4367        let project_root = test_project_root();
4368        let file = project_root.join("src/main.rs");
4369        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
4370        index.entries.push(EmbeddingEntry {
4371            chunk: SemanticChunk {
4372                file: file.clone(),
4373                name: "handle_request".to_string(),
4374                kind: SymbolKind::Function,
4375                start_line: 10,
4376                end_line: 25,
4377                exported: true,
4378                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
4379                snippet: "fn handle_request() {\n  // ...\n}".to_string(),
4380            },
4381            vector: vec![0.1, 0.2, 0.3, 0.4],
4382        });
4383        index.dimension = 4;
4384        index
4385            .file_mtimes
4386            .insert(file.clone(), SystemTime::UNIX_EPOCH);
4387        index.file_sizes.insert(file, 0);
4388        index.set_fingerprint(SemanticIndexFingerprint {
4389            backend: "fastembed".to_string(),
4390            model: "all-MiniLM-L6-v2".to_string(),
4391            base_url: FALLBACK_BACKEND.to_string(),
4392            dimension: 4,
4393            chunking_version: default_chunking_version(),
4394        });
4395
4396        let bytes = index.to_bytes();
4397        let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
4398
4399        assert_eq!(restored.entries.len(), 1);
4400        assert_eq!(restored.entries[0].chunk.name, "handle_request");
4401        assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
4402        assert_eq!(restored.dimension, 4);
4403        assert_eq!(restored.backend_label(), Some("fastembed"));
4404        assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
4405    }
4406
4407    #[test]
4408    fn semantic_cache_streaming_persistence_matches_legacy_bytes_and_round_trips() {
4409        let storage = tempfile::tempdir().expect("create storage dir");
4410        let project = storage.path().join("project");
4411        fs::create_dir_all(project.join("src")).expect("create project src");
4412        let file = project.join("src/lib.rs");
4413        fs::write(&file, "pub fn alpha() {}\npub fn beta() {}\n").expect("write source");
4414        let project_root = fs::canonicalize(&project).expect("canonical project");
4415        let file = fs::canonicalize(&file).expect("canonical file");
4416
4417        let mut index = SemanticIndex::new(project_root.clone(), 3);
4418        let mtime = SystemTime::UNIX_EPOCH + Duration::new(123, 456);
4419        index.file_mtimes.insert(file.clone(), mtime);
4420        index.file_sizes.insert(file.clone(), 42);
4421        index
4422            .file_hashes
4423            .insert(file.clone(), cache_freshness::zero_hash());
4424        index.entries.push(EmbeddingEntry {
4425            chunk: SemanticChunk {
4426                file: file.clone(),
4427                name: "alpha".to_string(),
4428                kind: SymbolKind::Function,
4429                start_line: 0,
4430                end_line: 0,
4431                exported: true,
4432                embed_text: "file:src/lib.rs kind:function name:alpha".to_string(),
4433                snippet: "pub fn alpha() {}".to_string(),
4434            },
4435            vector: vec![0.1, 0.2, 0.3],
4436        });
4437        index.entries.push(EmbeddingEntry {
4438            chunk: SemanticChunk {
4439                file: file.clone(),
4440                name: "beta".to_string(),
4441                kind: SymbolKind::Function,
4442                start_line: 1,
4443                end_line: 1,
4444                exported: true,
4445                embed_text: "file:src/lib.rs kind:function name:beta".to_string(),
4446                snippet: "pub fn beta() {}".to_string(),
4447            },
4448            vector: vec![0.4, 0.5, 0.6],
4449        });
4450        let fingerprint = SemanticIndexFingerprint {
4451            backend: "fastembed".to_string(),
4452            model: "all-MiniLM-L6-v2".to_string(),
4453            base_url: FALLBACK_BACKEND.to_string(),
4454            dimension: 3,
4455            chunking_version: default_chunking_version(),
4456        };
4457        index.set_fingerprint(fingerprint.clone());
4458
4459        let legacy_bytes = legacy_semantic_index_bytes(&index);
4460        assert_eq!(index.to_bytes(), legacy_bytes);
4461
4462        index.write_to_disk(storage.path(), "proj");
4463        let data_path = storage.path().join("semantic/proj/semantic.bin");
4464        assert_eq!(
4465            fs::read(&data_path).expect("read semantic.bin"),
4466            legacy_bytes
4467        );
4468
4469        let loaded = SemanticIndex::read_from_disk(
4470            storage.path(),
4471            "proj",
4472            &project_root,
4473            false,
4474            Some(&fingerprint.as_string()),
4475        )
4476        .expect("load semantic index");
4477        assert_eq!(loaded.entries.len(), index.entries.len());
4478        assert_eq!(loaded.dimension, index.dimension);
4479        assert_eq!(
4480            loaded.fingerprint().unwrap().as_string(),
4481            fingerprint.as_string()
4482        );
4483        assert_eq!(loaded.file_mtimes.get(&file), Some(&mtime));
4484        assert_eq!(loaded.file_sizes.get(&file), Some(&42));
4485        assert_eq!(
4486            loaded.file_hashes.get(&file),
4487            Some(&cache_freshness::zero_hash())
4488        );
4489        for (actual, expected) in loaded.entries.iter().zip(index.entries.iter()) {
4490            assert_eq!(actual.chunk.file, expected.chunk.file);
4491            assert_eq!(actual.chunk.name, expected.chunk.name);
4492            assert_eq!(actual.chunk.kind, expected.chunk.kind);
4493            assert_eq!(actual.chunk.start_line, expected.chunk.start_line);
4494            assert_eq!(actual.chunk.end_line, expected.chunk.end_line);
4495            assert_eq!(actual.chunk.exported, expected.chunk.exported);
4496            assert_eq!(actual.chunk.embed_text, expected.chunk.embed_text);
4497            assert_eq!(actual.chunk.snippet, expected.chunk.snippet);
4498            assert_eq!(actual.vector, expected.vector);
4499        }
4500        assert_eq!(loaded.to_bytes(), legacy_bytes);
4501    }
4502
4503    #[test]
4504    fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
4505        let cases = [
4506            (SymbolKind::Function, 0),
4507            (SymbolKind::Class, 1),
4508            (SymbolKind::Method, 2),
4509            (SymbolKind::Struct, 3),
4510            (SymbolKind::Interface, 4),
4511            (SymbolKind::Enum, 5),
4512            (SymbolKind::TypeAlias, 6),
4513            (SymbolKind::Variable, 7),
4514            (SymbolKind::Heading, 8),
4515            (SymbolKind::FileSummary, 9),
4516        ];
4517
4518        for (kind, encoded) in cases {
4519            assert_eq!(symbol_kind_to_u8(&kind), encoded);
4520            assert_eq!(u8_to_symbol_kind(encoded), kind);
4521        }
4522    }
4523
4524    #[test]
4525    fn test_search_top_k() {
4526        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4527        index.dimension = 3;
4528
4529        // Add entries with known vectors
4530        for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
4531            let mut vec = vec![0.0f32; 3];
4532            vec[i] = 1.0; // orthogonal vectors
4533            index.entries.push(EmbeddingEntry {
4534                chunk: SemanticChunk {
4535                    file: PathBuf::from("/src/lib.rs"),
4536                    name: name.to_string(),
4537                    kind: SymbolKind::Function,
4538                    start_line: (i * 10 + 1) as u32,
4539                    end_line: (i * 10 + 5) as u32,
4540                    exported: true,
4541                    embed_text: format!("kind:function name:{}", name),
4542                    snippet: format!("fn {}() {{}}", name),
4543                },
4544                vector: vec,
4545            });
4546        }
4547
4548        // Query aligned with "auth" (index 0)
4549        let query = vec![0.9, 0.1, 0.0];
4550        let results = index.search(&query, 2);
4551
4552        assert_eq!(results.len(), 2);
4553        assert_eq!(results[0].name, "auth"); // highest score
4554        assert!(results[0].score > results[1].score);
4555    }
4556
4557    #[test]
4558    fn test_empty_index_search() {
4559        let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4560        let results = index.search(&[0.1, 0.2, 0.3], 10);
4561        assert!(results.is_empty());
4562    }
4563
4564    #[test]
4565    fn single_line_symbol_builds_non_empty_snippet() {
4566        let symbol = Symbol {
4567            name: "answer".to_string(),
4568            kind: SymbolKind::Variable,
4569            range: crate::symbols::Range {
4570                start_line: 0,
4571                start_col: 0,
4572                end_line: 0,
4573                end_col: 24,
4574            },
4575            signature: Some("const answer = 42".to_string()),
4576            scope_chain: Vec::new(),
4577            exported: true,
4578            parent: None,
4579        };
4580        let source = "export const answer = 42;\n";
4581
4582        let snippet = build_snippet(&symbol, source);
4583
4584        assert_eq!(snippet, "export const answer = 42;");
4585    }
4586
4587    #[test]
4588    fn optimized_file_chunk_collection_matches_file_parser_path() {
4589        let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
4590        let file = project_root.join("src/semantic_index.rs");
4591        let source = std::fs::read_to_string(&file).unwrap();
4592
4593        let mut legacy_parser = FileParser::new();
4594        let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
4595        let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
4596
4597        let mut parsers = HashMap::new();
4598        let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
4599
4600        assert_eq!(
4601            chunk_fingerprint(&optimized_chunks),
4602            chunk_fingerprint(&legacy_chunks)
4603        );
4604    }
4605
4606    #[test]
4607    fn collect_file_chunks_indexes_java_symbols() {
4608        let dir = tempfile::tempdir().unwrap();
4609        let file = dir.path().join("Greeter.java");
4610        std::fs::write(
4611            &file,
4612            r#"package example;
4613
4614public class Greeter {
4615    public String greet(String name) {
4616        return "Hello, " + name;
4617    }
4618}
4619"#,
4620        )
4621        .unwrap();
4622
4623        let mut parsers = HashMap::new();
4624        let chunks = collect_file_chunks(dir.path(), &file, &mut parsers).unwrap();
4625
4626        assert!(
4627            !chunks.is_empty(),
4628            "Java file should produce semantic chunks"
4629        );
4630        assert!(
4631            chunks
4632                .iter()
4633                .any(|chunk| chunk.name == "Greeter" && chunk.kind == SymbolKind::Class),
4634            "Java class symbol should be chunked: {chunks:?}"
4635        );
4636        assert!(
4637            chunks
4638                .iter()
4639                .any(|chunk| chunk.name == "greet" && chunk.kind == SymbolKind::Method),
4640            "Java method symbol should be chunked: {chunks:?}"
4641        );
4642    }
4643
4644    fn chunk_fingerprint(
4645        chunks: &[SemanticChunk],
4646    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
4647        chunks
4648            .iter()
4649            .map(|chunk| {
4650                (
4651                    chunk.name.clone(),
4652                    chunk.kind.clone(),
4653                    chunk.start_line,
4654                    chunk.end_line,
4655                    chunk.exported,
4656                    chunk.embed_text.clone(),
4657                    chunk.snippet.clone(),
4658                )
4659            })
4660            .collect()
4661    }
4662
4663    #[test]
4664    fn collect_file_chunks_skips_oversized_file() {
4665        let dir = tempfile::tempdir().unwrap();
4666        let big = dir.path().join("huge.ts");
4667        // Just over the cap: a valid TS file that would otherwise yield chunks.
4668        let filler = "export const x = 1;\n"
4669            .repeat(((MAX_SEMANTIC_FILE_BYTES as usize) / "export const x = 1;\n".len()) + 16);
4670        std::fs::write(&big, &filler).unwrap();
4671        assert!(big.metadata().unwrap().len() > MAX_SEMANTIC_FILE_BYTES);
4672
4673        let mut parsers = HashMap::new();
4674        // Oversized → tracked with zero chunks, NOT an error (so the caller keeps
4675        // the file in metadata and freshness skips re-reading it).
4676        let chunks = collect_file_chunks(dir.path(), &big, &mut parsers).unwrap();
4677        assert!(chunks.is_empty(), "oversized file must yield no chunks");
4678
4679        // A small file of the same language still produces chunks.
4680        let small = dir.path().join("small.ts");
4681        std::fs::write(&small, "export function foo() { return 1; }\n").unwrap();
4682        let small_chunks = collect_file_chunks(dir.path(), &small, &mut parsers).unwrap();
4683        assert!(!small_chunks.is_empty(), "small file should still chunk");
4684    }
4685
4686    #[test]
4687    fn rejects_oversized_dimension_during_deserialization() {
4688        let mut bytes = Vec::new();
4689        bytes.push(1u8);
4690        bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
4691        bytes.extend_from_slice(&0u32.to_le_bytes());
4692        bytes.extend_from_slice(&0u32.to_le_bytes());
4693
4694        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4695    }
4696
4697    #[test]
4698    fn rejects_oversized_entry_count_during_deserialization() {
4699        let mut bytes = Vec::new();
4700        bytes.push(1u8);
4701        bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
4702        bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
4703        bytes.extend_from_slice(&0u32.to_le_bytes());
4704
4705        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4706    }
4707
4708    #[test]
4709    fn invalidate_file_removes_entries_and_mtime() {
4710        let target = PathBuf::from("/src/main.rs");
4711        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4712        index.entries.push(EmbeddingEntry {
4713            chunk: SemanticChunk {
4714                file: target.clone(),
4715                name: "main".to_string(),
4716                kind: SymbolKind::Function,
4717                start_line: 0,
4718                end_line: 1,
4719                exported: false,
4720                embed_text: "main".to_string(),
4721                snippet: "fn main() {}".to_string(),
4722            },
4723            vector: vec![1.0; DEFAULT_DIMENSION],
4724        });
4725        index
4726            .file_mtimes
4727            .insert(target.clone(), SystemTime::UNIX_EPOCH);
4728        index.file_sizes.insert(target.clone(), 0);
4729
4730        index.invalidate_file(&target);
4731
4732        assert!(index.entries.is_empty());
4733        assert!(!index.file_mtimes.contains_key(&target));
4734        assert!(!index.file_sizes.contains_key(&target));
4735    }
4736
4737    #[test]
4738    fn refresh_missing_changed_file_is_purged_after_collect() {
4739        let temp = tempfile::tempdir().unwrap();
4740        let project_root = temp.path();
4741        let file = project_root.join("src/lib.rs");
4742        fs::create_dir_all(file.parent().unwrap()).unwrap();
4743        write_rust_file(&file, "vanished_symbol");
4744
4745        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4746        let original_size = *index.file_sizes.get(&file).unwrap();
4747        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
4748        fs::remove_file(&file).unwrap();
4749
4750        let mut embed = test_vector_for_texts;
4751        let mut progress = |_done: usize, _total: usize| {};
4752        let summary = index
4753            .refresh_stale_files(
4754                project_root,
4755                std::slice::from_ref(&file),
4756                &mut embed,
4757                8,
4758                &mut progress,
4759            )
4760            .unwrap();
4761
4762        assert_eq!(summary.changed, 0);
4763        assert_eq!(summary.added, 0);
4764        assert_eq!(summary.deleted, 1);
4765        assert!(index.entries.is_empty());
4766        assert!(!index.file_mtimes.contains_key(&file));
4767        assert!(!index.file_sizes.contains_key(&file));
4768        assert!(!index.file_hashes.contains_key(&file));
4769    }
4770
4771    #[test]
4772    fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
4773        let temp = tempfile::tempdir().unwrap();
4774        let project_root = temp.path();
4775        let file = project_root.join("src/lib.rs");
4776        fs::create_dir_all(file.parent().unwrap()).unwrap();
4777        write_rust_file(&file, "kept_symbol");
4778
4779        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4780        let original_entry_count = index.entries.len();
4781        let original_mtime = *index.file_mtimes.get(&file).unwrap();
4782        let original_size = *index.file_sizes.get(&file).unwrap();
4783
4784        let stale_mtime = SystemTime::UNIX_EPOCH;
4785        set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
4786        fs::remove_file(&file).unwrap();
4787        fs::create_dir(&file).unwrap();
4788
4789        let mut embed = test_vector_for_texts;
4790        let mut progress = |_done: usize, _total: usize| {};
4791        let summary = index
4792            .refresh_stale_files(
4793                project_root,
4794                std::slice::from_ref(&file),
4795                &mut embed,
4796                8,
4797                &mut progress,
4798            )
4799            .unwrap();
4800
4801        assert_eq!(summary.changed, 0);
4802        assert_eq!(summary.added, 0);
4803        assert_eq!(summary.deleted, 0);
4804        assert_eq!(index.entries.len(), original_entry_count);
4805        assert!(index
4806            .entries
4807            .iter()
4808            .any(|entry| entry.chunk.name == "kept_symbol"));
4809        assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
4810        assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
4811        assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
4812    }
4813
4814    #[test]
4815    fn refresh_never_indexed_file_error_does_not_record_mtime() {
4816        let temp = tempfile::tempdir().unwrap();
4817        let project_root = temp.path();
4818        let missing = project_root.join("src/missing.rs");
4819        fs::create_dir_all(missing.parent().unwrap()).unwrap();
4820
4821        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4822        let mut embed = test_vector_for_texts;
4823        let mut progress = |_done: usize, _total: usize| {};
4824        let summary = index
4825            .refresh_stale_files(
4826                project_root,
4827                std::slice::from_ref(&missing),
4828                &mut embed,
4829                8,
4830                &mut progress,
4831            )
4832            .unwrap();
4833
4834        assert_eq!(summary.added, 0);
4835        assert_eq!(summary.changed, 0);
4836        assert_eq!(summary.deleted, 0);
4837        assert!(!index.file_mtimes.contains_key(&missing));
4838        assert!(!index.file_sizes.contains_key(&missing));
4839        assert!(index.entries.is_empty());
4840    }
4841
4842    #[test]
4843    fn refresh_reports_added_for_new_files() {
4844        let temp = tempfile::tempdir().unwrap();
4845        let project_root = temp.path();
4846        let existing = project_root.join("src/lib.rs");
4847        let added = project_root.join("src/new.rs");
4848        fs::create_dir_all(existing.parent().unwrap()).unwrap();
4849        write_rust_file(&existing, "existing_symbol");
4850        write_rust_file(&added, "added_symbol");
4851
4852        let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
4853        let mut embed = test_vector_for_texts;
4854        let mut progress = |_done: usize, _total: usize| {};
4855        let summary = index
4856            .refresh_stale_files(
4857                project_root,
4858                &[existing.clone(), added.clone()],
4859                &mut embed,
4860                8,
4861                &mut progress,
4862            )
4863            .unwrap();
4864
4865        assert_eq!(summary.added, 1);
4866        assert_eq!(summary.changed, 0);
4867        assert_eq!(summary.deleted, 0);
4868        assert_eq!(summary.total_processed, 2);
4869        assert!(index.file_mtimes.contains_key(&added));
4870        assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
4871    }
4872
4873    #[test]
4874    fn refresh_reports_deleted_for_removed_files() {
4875        let temp = tempfile::tempdir().unwrap();
4876        let project_root = temp.path();
4877        let deleted = project_root.join("src/deleted.rs");
4878        fs::create_dir_all(deleted.parent().unwrap()).unwrap();
4879        write_rust_file(&deleted, "deleted_symbol");
4880
4881        let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
4882        fs::remove_file(&deleted).unwrap();
4883
4884        let mut embed = test_vector_for_texts;
4885        let mut progress = |_done: usize, _total: usize| {};
4886        let summary = index
4887            .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
4888            .unwrap();
4889
4890        assert_eq!(summary.deleted, 1);
4891        assert_eq!(summary.changed, 0);
4892        assert_eq!(summary.added, 0);
4893        assert_eq!(summary.total_processed, 1);
4894        assert!(!index.file_mtimes.contains_key(&deleted));
4895        assert!(index.entries.is_empty());
4896    }
4897
4898    #[test]
4899    fn refresh_reports_changed_for_modified_files() {
4900        let temp = tempfile::tempdir().unwrap();
4901        let project_root = temp.path();
4902        let file = project_root.join("src/lib.rs");
4903        fs::create_dir_all(file.parent().unwrap()).unwrap();
4904        write_rust_file(&file, "old_symbol");
4905
4906        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4907        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
4908        write_rust_file(&file, "new_symbol");
4909
4910        let mut embed = test_vector_for_texts;
4911        let mut progress = |_done: usize, _total: usize| {};
4912        let summary = index
4913            .refresh_stale_files(
4914                project_root,
4915                std::slice::from_ref(&file),
4916                &mut embed,
4917                8,
4918                &mut progress,
4919            )
4920            .unwrap();
4921
4922        assert_eq!(summary.changed, 1);
4923        assert_eq!(summary.added, 0);
4924        assert_eq!(summary.deleted, 0);
4925        assert_eq!(summary.total_processed, 1);
4926        assert!(index
4927            .entries
4928            .iter()
4929            .any(|entry| entry.chunk.name == "new_symbol"));
4930        assert!(!index
4931            .entries
4932            .iter()
4933            .any(|entry| entry.chunk.name == "old_symbol"));
4934    }
4935
4936    #[test]
4937    fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
4938        let temp = tempfile::tempdir().unwrap();
4939        let project_root = temp.path();
4940        let file = project_root.join("src/lib.rs");
4941        fs::create_dir_all(file.parent().unwrap()).unwrap();
4942        write_rust_file(&file, "clean_symbol");
4943
4944        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4945        let original_entries = index.entries.len();
4946        let mut embed_called = false;
4947        let mut embed = |texts: Vec<String>| {
4948            embed_called = true;
4949            test_vector_for_texts(texts)
4950        };
4951        let mut progress = |_done: usize, _total: usize| {};
4952        let summary = index
4953            .refresh_stale_files(
4954                project_root,
4955                std::slice::from_ref(&file),
4956                &mut embed,
4957                8,
4958                &mut progress,
4959            )
4960            .unwrap();
4961
4962        assert!(summary.is_noop());
4963        assert_eq!(summary.total_processed, 1);
4964        assert!(!embed_called);
4965        assert_eq!(index.entries.len(), original_entries);
4966    }
4967
4968    #[test]
4969    fn detects_missing_onnx_runtime_from_dynamic_load_error() {
4970        let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
4971
4972        assert!(is_onnx_runtime_unavailable(message));
4973    }
4974
4975    #[test]
4976    fn formats_missing_onnx_runtime_with_install_hint() {
4977        let message = format_embedding_init_error(
4978            "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
4979        );
4980
4981        assert!(message.starts_with("ONNX Runtime not found. Install via:"));
4982        assert!(message.contains("Original error:"));
4983    }
4984
4985    #[test]
4986    fn interactive_query_embedding_model_caps_remote_timeout() {
4987        let mut config = SemanticBackendConfig {
4988            backend: SemanticBackend::OpenAiCompatible,
4989            model: "test-embedding".to_string(),
4990            base_url: Some("http://127.0.0.1:9".to_string()),
4991            api_key_env: None,
4992            timeout_ms: 0,
4993            max_batch_size: 64,
4994            max_files: 20_000,
4995        };
4996
4997        let build_model = SemanticEmbeddingModel::from_config(&config).unwrap();
4998        let query_model = SemanticEmbeddingModel::from_config_for_query(&config).unwrap();
4999        assert_eq!(
5000            build_model.timeout_ms(),
5001            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS,
5002            "background build keeps the longer default embedding timeout"
5003        );
5004        assert_eq!(
5005            query_model.timeout_ms(),
5006            DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS,
5007            "interactive query embedding is capped below the dispatch transport timeout"
5008        );
5009
5010        config.timeout_ms = 60_000;
5011        let query_model = SemanticEmbeddingModel::from_config_for_query(&config).unwrap();
5012        assert_eq!(
5013            query_model.timeout_ms(),
5014            DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS,
5015            "explicitly long backend timeouts are capped for interactive queries"
5016        );
5017
5018        config.timeout_ms = 3_000;
5019        let query_model = SemanticEmbeddingModel::from_config_for_query(&config).unwrap();
5020        assert_eq!(
5021            query_model.timeout_ms(),
5022            3_000,
5023            "shorter explicit timeouts are respected for interactive queries"
5024        );
5025    }
5026
5027    #[test]
5028    fn openai_compatible_backend_embeds_with_mock_server() {
5029        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
5030            assert!(request_line.starts_with("POST "));
5031            assert_eq!(path, "/v1/embeddings");
5032            "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
5033        });
5034
5035        let config = SemanticBackendConfig {
5036            backend: SemanticBackend::OpenAiCompatible,
5037            model: "test-embedding".to_string(),
5038            base_url: Some(base_url),
5039            api_key_env: None,
5040            timeout_ms: 5_000,
5041            max_batch_size: 64,
5042            max_files: 20_000,
5043        };
5044
5045        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
5046        let vectors = model
5047            .embed(vec!["hello".to_string(), "world".to_string()])
5048            .unwrap();
5049
5050        assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
5051        handle.join().unwrap();
5052    }
5053
5054    /// Regression for issue #36: AFT was sending TWO Content-Type headers
5055    /// on the OpenAI embeddings request — once implicitly via `.json(&body)`
5056    /// and again explicitly via `.header("Content-Type", "application/json")`.
5057    /// reqwest's `.header()` calls `HeaderMap::append`, which produces two
5058    /// headers on the wire. OpenAI's /v1/embeddings endpoint rejects that
5059    /// with `HTTP 400 "you must provide a model parameter"` even though the
5060    /// body actually contains `model`. The fix is to drop the explicit
5061    /// `.header("Content-Type", ...)` call. This test pins that we send
5062    /// exactly one Content-Type header.
5063    #[test]
5064    fn openai_compatible_request_has_single_content_type_header() {
5065        use std::sync::{Arc, Mutex};
5066        let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
5067        let captured_for_thread = Arc::clone(&captured);
5068
5069        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
5070        let addr = listener.local_addr().expect("local addr");
5071        let handle = thread::spawn(move || {
5072            let (mut stream, _) = listener.accept().expect("accept");
5073            let mut buf = Vec::new();
5074            let mut chunk = [0u8; 4096];
5075            let mut header_end = None;
5076            let mut content_length = 0usize;
5077            loop {
5078                let n = stream.read(&mut chunk).expect("read");
5079                if n == 0 {
5080                    break;
5081                }
5082                buf.extend_from_slice(&chunk[..n]);
5083                if header_end.is_none() {
5084                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
5085                        header_end = Some(pos + 4);
5086                        for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
5087                            if let Some(value) = line.strip_prefix("Content-Length:") {
5088                                content_length = value.trim().parse::<usize>().unwrap_or(0);
5089                            }
5090                        }
5091                    }
5092                }
5093                if let Some(end) = header_end {
5094                    if buf.len() >= end + content_length {
5095                        break;
5096                    }
5097                }
5098            }
5099            *captured_for_thread.lock().unwrap() = buf;
5100            let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
5101            let response = format!(
5102                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
5103                body.len(),
5104                body
5105            );
5106            let _ = stream.write_all(response.as_bytes());
5107        });
5108
5109        let config = SemanticBackendConfig {
5110            backend: SemanticBackend::OpenAiCompatible,
5111            model: "text-embedding-3-small".to_string(),
5112            base_url: Some(format!("http://{}", addr)),
5113            api_key_env: None,
5114            timeout_ms: 5_000,
5115            max_batch_size: 64,
5116            max_files: 20_000,
5117        };
5118        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
5119        let _ = model.embed(vec!["probe".to_string()]).unwrap();
5120        handle.join().unwrap();
5121
5122        let bytes = captured.lock().unwrap().clone();
5123        let request = String::from_utf8_lossy(&bytes);
5124
5125        // Lowercase line counts because HTTP headers are case-insensitive
5126        // and reqwest may emit `content-type` in lowercase under HTTP/2.
5127        let content_type_lines = request
5128            .lines()
5129            .filter(|line| {
5130                let lower = line.to_ascii_lowercase();
5131                lower.starts_with("content-type:")
5132            })
5133            .count();
5134        assert_eq!(
5135            content_type_lines, 1,
5136            "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
5137        );
5138
5139        // The body must still include the model field — pin this so a future
5140        // change can't accidentally drop `model` while fixing duplicate headers.
5141        assert!(
5142            request.contains(r#""model":"text-embedding-3-small""#),
5143            "request body should contain model field; full request:\n{request}",
5144        );
5145    }
5146
5147    #[test]
5148    fn ollama_backend_embeds_with_mock_server() {
5149        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
5150            assert!(request_line.starts_with("POST "));
5151            assert_eq!(path, "/api/embed");
5152            "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
5153        });
5154
5155        let config = SemanticBackendConfig {
5156            backend: SemanticBackend::Ollama,
5157            model: "embeddinggemma".to_string(),
5158            base_url: Some(base_url),
5159            api_key_env: None,
5160            timeout_ms: 5_000,
5161            max_batch_size: 64,
5162            max_files: 20_000,
5163        };
5164
5165        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
5166        let vectors = model
5167            .embed(vec!["hello".to_string(), "world".to_string()])
5168            .unwrap();
5169
5170        assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
5171        handle.join().unwrap();
5172    }
5173
5174    #[test]
5175    fn read_from_disk_rejects_fingerprint_mismatch() {
5176        let storage = tempfile::tempdir().unwrap();
5177        let project_key = "proj";
5178
5179        let project_root = test_project_root();
5180        let file = project_root.join("src/main.rs");
5181        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
5182        index.entries.push(EmbeddingEntry {
5183            chunk: SemanticChunk {
5184                file: file.clone(),
5185                name: "handle_request".to_string(),
5186                kind: SymbolKind::Function,
5187                start_line: 10,
5188                end_line: 25,
5189                exported: true,
5190                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
5191                snippet: "fn handle_request() {}".to_string(),
5192            },
5193            vector: vec![0.1, 0.2, 0.3],
5194        });
5195        index.dimension = 3;
5196        index
5197            .file_mtimes
5198            .insert(file.clone(), SystemTime::UNIX_EPOCH);
5199        index.file_sizes.insert(file, 0);
5200        index.set_fingerprint(SemanticIndexFingerprint {
5201            backend: "openai_compatible".to_string(),
5202            model: "test-embedding".to_string(),
5203            base_url: "http://127.0.0.1:1234/v1".to_string(),
5204            dimension: 3,
5205            chunking_version: default_chunking_version(),
5206        });
5207        index.write_to_disk(storage.path(), project_key);
5208
5209        let matching = index.fingerprint().unwrap().as_string();
5210        assert!(SemanticIndex::read_from_disk(
5211            storage.path(),
5212            project_key,
5213            &project_root,
5214            false,
5215            Some(&matching),
5216        )
5217        .is_some());
5218
5219        let mismatched = SemanticIndexFingerprint {
5220            backend: "ollama".to_string(),
5221            model: "embeddinggemma".to_string(),
5222            base_url: "http://127.0.0.1:11434".to_string(),
5223            dimension: 3,
5224            chunking_version: default_chunking_version(),
5225        }
5226        .as_string();
5227        assert!(SemanticIndex::read_from_disk(
5228            storage.path(),
5229            project_key,
5230            &project_root,
5231            false,
5232            Some(&mismatched),
5233        )
5234        .is_none());
5235    }
5236
5237    #[test]
5238    fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
5239        let storage = tempfile::tempdir().unwrap();
5240        let project_key = "proj-v3";
5241        let dir = storage.path().join("semantic").join(project_key);
5242        fs::create_dir_all(&dir).unwrap();
5243
5244        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
5245        index.entries.push(EmbeddingEntry {
5246            chunk: SemanticChunk {
5247                file: PathBuf::from("/src/main.rs"),
5248                name: "handle_request".to_string(),
5249                kind: SymbolKind::Function,
5250                start_line: 0,
5251                end_line: 0,
5252                exported: true,
5253                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
5254                snippet: "fn handle_request() {}".to_string(),
5255            },
5256            vector: vec![0.1, 0.2, 0.3],
5257        });
5258        index.dimension = 3;
5259        index
5260            .file_mtimes
5261            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
5262        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
5263        let fingerprint = SemanticIndexFingerprint {
5264            backend: "fastembed".to_string(),
5265            model: "test".to_string(),
5266            base_url: FALLBACK_BACKEND.to_string(),
5267            dimension: 3,
5268            chunking_version: default_chunking_version(),
5269        };
5270        index.set_fingerprint(fingerprint.clone());
5271
5272        let mut bytes = index.to_bytes();
5273        bytes[0] = SEMANTIC_INDEX_VERSION_V3;
5274        fs::write(dir.join("semantic.bin"), bytes).unwrap();
5275
5276        assert!(SemanticIndex::read_from_disk(
5277            storage.path(),
5278            project_key,
5279            &test_project_root(),
5280            false,
5281            Some(&fingerprint.as_string())
5282        )
5283        .is_none());
5284        assert!(!dir.join("semantic.bin").exists());
5285    }
5286
5287    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
5288        crate::symbols::Symbol {
5289            name: name.to_string(),
5290            kind,
5291            range: crate::symbols::Range {
5292                start_line: start,
5293                start_col: 0,
5294                end_line: end,
5295                end_col: 0,
5296            },
5297            signature: None,
5298            scope_chain: Vec::new(),
5299            exported: false,
5300            parent: None,
5301        }
5302    }
5303
5304    /// Heading symbols (Markdown / HTML headings) must NOT be indexed —
5305    /// they overwhelmingly dominated semantic results even on code-shaped
5306    /// queries because heading prose embeds far more strongly than code
5307    /// chunks. Skipping headings keeps aft_search a code-finder.
5308    #[test]
5309    fn symbols_to_chunks_skips_heading_symbols() {
5310        let project_root = PathBuf::from("/proj");
5311        let file = project_root.join("README.md");
5312        let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
5313
5314        let symbols = vec![
5315            make_symbol(SymbolKind::Heading, "Title", 0, 2),
5316            make_symbol(SymbolKind::Heading, "Section", 4, 6),
5317        ];
5318
5319        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
5320        assert!(
5321            chunks.is_empty(),
5322            "Heading symbols must be filtered out before embedding; got {} chunk(s)",
5323            chunks.len()
5324        );
5325    }
5326
5327    /// A symbol with an enormous signature (e.g. a YAML/Kubernetes CronJob
5328    /// whose inline `command:` script is parsed into the signature) must not
5329    /// produce an embed_text that overflows the embedding backend's physical
5330    /// batch. Before the clamp, the unbounded `signature:` append created a
5331    /// multi-KB input that aborted the whole index build and degraded every
5332    /// search to lexical-only.
5333    #[test]
5334    fn build_embed_text_clamps_oversized_signature() {
5335        let project_root = PathBuf::from("/proj");
5336        let file = project_root.join("cronjob.yaml");
5337        let huge_sig = "kubectl ".repeat(2000); // ~16 KB
5338        let source = "apiVersion: batch/v1\nkind: CronJob\n";
5339
5340        let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
5341        symbol.signature = Some(huge_sig);
5342
5343        let text = build_embed_text(&symbol, source, &file, &project_root);
5344        assert!(
5345            text.chars().count() <= MAX_EMBED_TEXT_CHARS,
5346            "embed_text must be clamped to {} chars, got {}",
5347            MAX_EMBED_TEXT_CHARS,
5348            text.chars().count()
5349        );
5350    }
5351
5352    /// Code symbols (functions, classes, methods, structs, etc.) must still
5353    /// be indexed alongside the heading skip — otherwise we'd starve the
5354    /// index entirely.
5355    #[test]
5356    fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
5357        let project_root = PathBuf::from("/proj");
5358        let file = project_root.join("src/lib.rs");
5359        let source = "pub fn handle_request() -> bool {\n    true\n}\n";
5360
5361        let symbols = vec![
5362            // A heading mixed in (e.g. from a doc comment block elsewhere).
5363            make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
5364            make_symbol(SymbolKind::Function, "handle_request", 0, 2),
5365            make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
5366        ];
5367
5368        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
5369        assert_eq!(
5370            chunks.len(),
5371            3,
5372            "Expected file-summary + 2 code chunks (Function + Struct), got {}",
5373            chunks.len()
5374        );
5375        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
5376        assert!(chunks
5377            .iter()
5378            .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
5379        assert!(names.contains(&"handle_request"));
5380        assert!(names.contains(&"AuthService"));
5381        assert!(
5382            !names.contains(&"doc heading"),
5383            "Heading symbol leaked into chunks: {names:?}"
5384        );
5385    }
5386
5387    #[test]
5388    fn validate_ssrf_allows_loopback_hostnames() {
5389        // Loopback hostnames are explicitly allowed so self-hosted backends
5390        // (Ollama at http://localhost:11434) work at their default config.
5391        for host in &[
5392            "http://localhost",
5393            "http://localhost:8080",
5394            "http://localhost:11434", // Ollama default
5395            "http://localhost.localdomain",
5396            "http://foo.localhost",
5397        ] {
5398            assert!(
5399                validate_base_url_no_ssrf(host).is_ok(),
5400                "Expected {host} to be allowed (loopback), got: {:?}",
5401                validate_base_url_no_ssrf(host)
5402            );
5403        }
5404    }
5405
5406    #[test]
5407    fn validate_ssrf_allows_loopback_ips() {
5408        // 127.0.0.0/8 is loopback — by definition same-machine and not an
5409        // SSRF target. Allow it so Ollama at http://127.0.0.1:11434 works.
5410        for url in &[
5411            "http://127.0.0.1",
5412            "http://127.0.0.1:11434", // Ollama default
5413            "http://127.0.0.1:8080",
5414            "http://127.1.2.3",
5415        ] {
5416            let result = validate_base_url_no_ssrf(url);
5417            assert!(
5418                result.is_ok(),
5419                "Expected {url} to be allowed (loopback), got: {:?}",
5420                result
5421            );
5422        }
5423    }
5424
5425    #[test]
5426    fn validate_ssrf_rejects_private_non_loopback_ips() {
5427        // Non-loopback private/reserved IPs remain rejected — homelab/intranet
5428        // services on LAN IPs are real SSRF targets even though the user
5429        // configured them. Users who want this can opt in by binding the
5430        // service to a public-routable address.
5431        for url in &[
5432            "http://192.168.1.1",
5433            "http://10.0.0.1",
5434            "http://172.16.0.1",
5435            "http://169.254.169.254",
5436            "http://100.64.0.1",
5437        ] {
5438            let result = validate_base_url_no_ssrf(url);
5439            assert!(
5440                result.is_err(),
5441                "Expected {url} to be rejected (non-loopback private), got: {:?}",
5442                result
5443            );
5444        }
5445    }
5446
5447    #[test]
5448    fn validate_ssrf_rejects_mdns_local_hostnames() {
5449        // mDNS .local hostnames typically resolve to LAN devices, not
5450        // loopback. Rejecting them before DNS lookup gives a clearer error.
5451        for host in &[
5452            "http://printer.local",
5453            "http://nas.local:8080",
5454            "http://homelab.local",
5455        ] {
5456            let result = validate_base_url_no_ssrf(host);
5457            assert!(
5458                result.is_err(),
5459                "Expected {host} to be rejected (mDNS), got: {:?}",
5460                result
5461            );
5462        }
5463    }
5464
5465    #[test]
5466    fn normalize_base_url_allows_localhost_for_tests() {
5467        // normalize_base_url itself should NOT block localhost — only
5468        // validate_base_url_no_ssrf does. Tests construct backends directly.
5469        assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
5470        assert!(normalize_base_url("http://localhost:8080").is_ok());
5471    }
5472
5473    #[test]
5474    fn ssrf_guard_blocks_reserved_ranges_but_allows_loopback() {
5475        use std::net::IpAddr;
5476        let blocked = |s: &str| is_private_non_loopback_ip(&s.parse::<IpAddr>().unwrap());
5477
5478        // Private / link-local / CGNAT — blocked (unchanged behavior).
5479        assert!(blocked("10.0.0.1"));
5480        assert!(blocked("192.168.1.1"));
5481        assert!(blocked("169.254.0.1"));
5482        assert!(blocked("100.64.0.1"));
5483        // Newly covered by delegating to url_fetch's complete list:
5484        assert!(
5485            blocked("198.18.0.1"),
5486            "RFC2544 benchmark range must be blocked"
5487        );
5488        assert!(blocked("224.0.0.1"), "multicast must be blocked");
5489        assert!(blocked("fc00::1"), "IPv6 ULA must be blocked");
5490        assert!(blocked("fe80::1"), "IPv6 link-local must be blocked");
5491
5492        // Loopback — allowed (local Ollama endpoint), incl. IPv4-mapped form.
5493        assert!(!blocked("127.0.0.1"), "loopback must stay allowed");
5494        assert!(!blocked("::1"), "IPv6 loopback must stay allowed");
5495        assert!(
5496            !blocked("::ffff:127.0.0.1"),
5497            "IPv4-mapped loopback must stay allowed (matches prior carve-out)"
5498        );
5499
5500        // A public address must NOT be flagged.
5501        assert!(!blocked("8.8.8.8"));
5502    }
5503
5504    /// Pin the user-facing wording of the ONNX version-mismatch error.
5505    /// The auto-fix path MUST be listed first because it's the only safe
5506    /// option that doesn't require sudo or risk breaking other apps that
5507    /// link the system library. Regression of any of these strings would
5508    /// either mislead users (system rm before auto-fix) or break the
5509    /// `aft doctor --fix` discovery path.
5510    #[test]
5511    fn ort_mismatch_message_recommends_auto_fix_first() {
5512        let msg =
5513            format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
5514
5515        // The reported version and path must appear verbatim.
5516        assert!(
5517            msg.contains("v1.9.0"),
5518            "should report detected version: {msg}"
5519        );
5520        assert!(
5521            msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
5522            "should report system path: {msg}"
5523        );
5524        assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
5525
5526        // Solution ordering: auto-fix is #1, system rm is #2, install is #3.
5527        let auto_fix_pos = msg
5528            .find("Auto-fix")
5529            .expect("Auto-fix solution missing — users won't discover --fix");
5530        let remove_pos = msg
5531            .find("Remove the old library")
5532            .expect("system-rm solution missing");
5533        assert!(
5534            auto_fix_pos < remove_pos,
5535            "Auto-fix must come before manual rm — see PR comment thread"
5536        );
5537
5538        // The auto-fix command must be runnable as-is on a fresh system.
5539        assert!(
5540            msg.contains("npx @cortexkit/aft doctor --fix"),
5541            "auto-fix command must be present and copy-pasteable: {msg}"
5542        );
5543    }
5544
5545    #[cfg(any(target_os = "linux", target_os = "macos"))]
5546    #[test]
5547    fn loaded_ort_version_detection_prefers_actual_loaded_library_path() {
5548        let requested = "libonnxruntime.so";
5549        let actual = "/usr/local/lib/libonnxruntime.so.1.19.0";
5550
5551        assert_eq!(detect_ort_version_from_path(requested), None);
5552        let (version, source) =
5553            detect_ort_version_from_resolved_or_requested(Some(actual.to_string()), requested);
5554
5555        assert_eq!(version, Some("1.19.0".to_string()));
5556        assert_eq!(source, actual);
5557
5558        let msg = format_ort_version_mismatch(&version.unwrap(), &source);
5559        assert!(msg.contains("v1.19.0"));
5560        assert!(msg.contains(actual));
5561    }
5562
5563    /// macOS dylib paths must not produce a malformed message when the
5564    /// system path lacks a trailing slash. This is a regression guard
5565    /// for the "{}\n{}" format string contract.
5566    #[test]
5567    fn ort_mismatch_message_handles_macos_dylib_path() {
5568        let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
5569        assert!(msg.contains("v1.9.0"));
5570        assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
5571        // The dylib path must appear in the auto-fix paragraph (single
5572        // quotes around it) AND in the manual-rm paragraph; verify
5573        // both placements survived the format string.
5574        assert!(
5575            msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
5576            "system path should be quoted in the auto-fix sentence: {msg}"
5577        );
5578    }
5579}