Skip to main content

aft/
semantic_index.rs

1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::io::{self, BufReader, BufWriter, Cursor, Read, Write};
18use std::path::{Path, PathBuf};
19use std::sync::Mutex;
20use std::time::Duration;
21use std::time::SystemTime;
22use tree_sitter::Parser;
23use url::Url;
24
const DEFAULT_DIMENSION: usize = 384;
const MAX_ENTRIES: usize = 1_000_000;
// Upper bound on supported embedding width. Large enough for high-dimensional
// backends such as OpenAI text-embedding-3-large (3072) and common local
// models (4096), while still keeping the supported shape bounded.
const MAX_DIMENSION: usize = 4096;
const F32_BYTES: usize = std::mem::size_of::<f32>();
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
const ONNX_RUNTIME_INSTALL_HINT: &str = concat!(
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), ",
    "apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). ",
    "AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.",
);

const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
/// V3: the file-mtime table gains subsec_nanos, so staleness detection
/// survives restart round-trips on filesystems with subsecond mtime precision
/// (APFS, ext4 with nsec, NTFS). V1/V2 persisted whole-second mtimes only,
/// which made every restart flag ~99% of files as stale and re-embed them.
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
/// V4: same on-disk layout as V3; the bump forces a one-time rebuild of
/// persisted snippets after fixing symbol ranges that were incorrectly
/// treated as 1-based.
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
/// V5: the file metadata table gains file sizes, so incremental staleness
/// detection can catch content changes that mtime precision misses.
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
/// V6: paths are stored relative to project_root and content hashes are added.
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// V7: qualified symbol names are added as ranking metadata; embeddings are
/// unchanged.
const SEMANTIC_INDEX_VERSION_V7: u8 = 7;
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
// Build/refresh embedding requests get the larger budget: they run on
// background workers and often push many texts through a cold local backend.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
// Interactive query embedding happens inside semantic_search dispatch, so it
// is kept short: a slow or unreachable remote backend should degrade to
// lexical search quickly.
const DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS: u64 = 8_000;
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
const FALLBACK_BACKEND: &str = "none";
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
69
70pub struct SemanticIndexLock {
71    _guard: fs_lock::LockGuard,
72}
73
74impl SemanticIndexLock {
75    pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
76        let dir = storage_dir.join("semantic").join(project_key);
77        fs::create_dir_all(&dir)?;
78        let path = dir.join("cache.lock");
79        let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
80            .lock()
81            .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
82        fs_lock::try_acquire(&path, Duration::from_secs(2))
83            .map(|guard| Self { _guard: guard })
84            .map_err(|error| match error {
85                fs_lock::AcquireError::Timeout => {
86                    std::io::Error::other("timed out acquiring semantic cache lock")
87                }
88                fs_lock::AcquireError::Io(error) => error,
89            })
90    }
91}
92
93#[derive(Debug, Clone, Serialize, Deserialize)]
94pub struct SemanticIndexFingerprint {
95    pub backend: String,
96    pub model: String,
97    #[serde(default)]
98    pub base_url: String,
99    pub dimension: usize,
100    #[serde(default = "default_chunking_version")]
101    pub chunking_version: u32,
102}
103
104fn default_chunking_version() -> u32 {
105    2
106}
107
108impl SemanticIndexFingerprint {
109    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
110        // Use normalized URL for fingerprinting so cosmetic differences
111        // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds.
112        let base_url = config
113            .base_url
114            .as_ref()
115            .and_then(|u| normalize_base_url(u).ok())
116            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
117        Self {
118            backend: config.backend.as_str().to_string(),
119            model: config.model.clone(),
120            base_url,
121            dimension,
122            chunking_version: default_chunking_version(),
123        }
124    }
125
126    pub fn as_string(&self) -> String {
127        serde_json::to_string(self).unwrap_or_else(|_| String::new())
128    }
129
130    fn matches_expected(&self, expected: &str) -> bool {
131        let encoded = self.as_string();
132        !encoded.is_empty() && encoded == expected
133    }
134}
135
136enum SemanticEmbeddingEngine {
137    /// Local ONNX embedder (all-MiniLM-L6-v2 via raw `ort`). The config-facing
138    /// backend string stays "fastembed" for index-fingerprint compatibility.
139    Local(LocalEmbedder),
140    OpenAiCompatible {
141        client: Client,
142        model: String,
143        base_url: String,
144        api_key: Option<String>,
145    },
146    Ollama {
147        client: Client,
148        model: String,
149        base_url: String,
150    },
151}
152
153pub struct SemanticEmbeddingModel {
154    backend: SemanticBackend,
155    model: String,
156    base_url: Option<String>,
157    timeout_ms: u64,
158    max_batch_size: usize,
159    dimension: Option<usize>,
160    engine: SemanticEmbeddingEngine,
161    query_embedding_cache: HashMap<String, Vec<f32>>,
162    query_embedding_cache_order: VecDeque<String>,
163    query_embedding_cache_hits: u64,
164    query_embedding_cache_misses: u64,
165}
166
167pub type EmbeddingModel = SemanticEmbeddingModel;
168
169fn validate_embedding_batch(
170    vectors: &[Vec<f32>],
171    expected_count: usize,
172    context: &str,
173) -> Result<(), String> {
174    if expected_count > 0 && vectors.is_empty() {
175        return Err(format!(
176            "{context} returned no vectors for {expected_count} inputs"
177        ));
178    }
179
180    if vectors.len() != expected_count {
181        return Err(format!(
182            "{context} returned {} vectors for {} inputs",
183            vectors.len(),
184            expected_count
185        ));
186    }
187
188    let Some(first_vector) = vectors.first() else {
189        return Ok(());
190    };
191    let expected_dimension = first_vector.len();
192    validate_embedding_dimension(expected_dimension)
193        .map_err(|error| format!("{context} returned {error}"))?;
194    for (index, vector) in vectors.iter().enumerate() {
195        if vector.len() != expected_dimension {
196            return Err(format!(
197                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
198                vector.len()
199            ));
200        }
201    }
202
203    Ok(())
204}
205
206fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
207    if dimension == 0 || dimension > MAX_DIMENSION {
208        return Err(format!(
209            "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
210        ));
211    }
212
213    Ok(())
214}
215
216/// Normalize a base URL: validate scheme and strip trailing slash.
217/// Does NOT perform SSRF/private-IP validation — call
218/// `validate_base_url_no_ssrf` separately when processing user-supplied config.
219fn normalize_base_url(raw: &str) -> Result<String, String> {
220    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
221    let scheme = parsed.scheme();
222    if scheme != "http" && scheme != "https" {
223        return Err(format!(
224            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
225            scheme
226        ));
227    }
228    Ok(parsed.to_string().trim_end_matches('/').to_string())
229}
230
231/// Validate that a base URL does not point to a private/loopback address.
232/// Call this on user-supplied config (at configure time) to prevent SSRF.
233/// Not called for programmatically constructed configs (e.g. tests).
234///
235/// **Loopback is allowed.** Self-hosted embedding backends (e.g. Ollama at
236/// `http://127.0.0.1:11434`) are a primary use case for `aft_search`. Loopback
237/// addresses by definition cannot be exploited as SSRF targets — they only
238/// reach services on the same machine. Allowing loopback unblocks Ollama at its
239/// default config without opening up SSRF to LAN/intranet services, which
240/// remain rejected.
241///
242/// **mDNS `.local` is rejected.** mDNS hostnames typically resolve to LAN
243/// devices (printers, homelab servers); rejecting them before DNS lookup keeps
244/// the SSRF guard meaningful for non-loopback private networks.
245pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
246    use std::net::{IpAddr, ToSocketAddrs};
247
248    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
249
250    let host = parsed.host_str().unwrap_or("");
251
252    // Loopback hostnames are explicitly allowed. RFC 6761 mandates that
253    // `localhost` and `*.localhost` resolve to loopback;
254    // `localhost.localdomain` is a historical alias used on some Linux
255    // distros. Self-hosted backends like Ollama use these by default.
256    let is_loopback_host =
257        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
258    if is_loopback_host {
259        return Ok(());
260    }
261
262    // mDNS hostnames are typically LAN devices, not loopback. Reject before
263    // DNS lookup so users get a clear error rather than a private-IP error.
264    if host.ends_with(".local") {
265        return Err(format!(
266            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
267        ));
268    }
269
270    // Resolve the hostname. Reject private/link-local/CGNAT IPs but NOT
271    // loopback (which is by definition same-machine and not an SSRF target).
272    let port = parsed.port_or_known_default().unwrap_or(443);
273    let addr_str = format!("{host}:{port}");
274    let addrs: Vec<IpAddr> = addr_str
275        .to_socket_addrs()
276        .map(|iter| iter.map(|sa| sa.ip()).collect())
277        .unwrap_or_default();
278    for ip in &addrs {
279        if is_private_non_loopback_ip(ip) {
280            return Err(format!(
281                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
282            ));
283        }
284    }
285
286    Ok(())
287}
288
289/// Returns true for IPv4/IPv6 addresses in private/link-local/CGNAT/benchmark/
290/// multicast/reserved ranges, EXCLUDING loopback (127.0.0.0/8 and ::1). Loopback
291/// is considered safe for SSRF purposes (same-machine, e.g. a local Ollama
292/// endpoint) — see [`validate_base_url_no_ssrf`] for rationale.
293///
294/// Delegates to [`crate::url_fetch::is_private_or_reserved_ip`] so there is one
295/// authoritative reserved-range list (the url_fetch copy is the maintained one;
296/// this used to be a drifting subset that missed e.g. 198.18.0.0/15 and the
297/// multicast/reserved blocks). We only re-add the loopback carve-out the
298/// url_fetch guard deliberately does not make.
299fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
300    // Canonicalize so an IPv4-mapped loopback (`::ffff:127.0.0.1`) is also
301    // recognized as loopback, matching the prior carve-out.
302    if ip.to_canonical().is_loopback() {
303        return false;
304    }
305    crate::url_fetch::is_private_or_reserved_ip(*ip)
306}
307
308fn build_openai_embeddings_endpoint(base_url: &str) -> String {
309    if base_url.ends_with("/v1") {
310        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
311    } else {
312        format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
313    }
314}
315
316fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
317    if base_url.ends_with("/api") {
318        format!("{base_url}/embed")
319    } else {
320        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
321    }
322}
323
/// Trim surrounding whitespace from an optional token and collapse an empty
/// or whitespace-only value to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    value
        .as_deref()
        .map(str::trim)
        .filter(|token| !token.is_empty())
        .map(str::to_string)
}
334
335fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
336    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
337}
338
339/// Local backends (LM Studio, Ollama, llama.cpp) can return a 4xx — usually
340/// 400/409 — while a model is loading or was just unloaded. Only narrowly known
341/// local-backend loading/unloaded payloads are classified transient; generic
342/// 4xx bodies that merely mention phrases like "loading model" remain
343/// permanent so misconfigurations do not retry forever.
344fn embedding_response_body_is_transient(status: reqwest::StatusCode, raw: &str) -> bool {
345    if !matches!(
346        status,
347        reqwest::StatusCode::BAD_REQUEST
348            | reqwest::StatusCode::CONFLICT
349            | reqwest::StatusCode::REQUEST_TIMEOUT
350            | reqwest::StatusCode::LOCKED
351            | reqwest::StatusCode::TOO_EARLY
352    ) {
353        return false;
354    }
355
356    let lower = raw.to_ascii_lowercase();
357    let normalized = lower.trim();
358
359    normalized.contains("model was unloaded while the request was still in queue")
360        || normalized == "model is loading"
361        || normalized.starts_with("model is loading,")
362        || normalized.contains(r#""error":"model is loading"#)
363        || normalized.contains(r#""message":"model is loading"#)
364        || normalized == "model not loaded"
365        || normalized.contains(r#""error":"model not loaded""#)
366        || normalized.contains(r#""message":"model not loaded""#)
367        || normalized == "loading model into memory"
368        || normalized.contains(r#""error":"loading model into memory""#)
369        || normalized.contains(r#""message":"loading model into memory""#)
370        || normalized == "model is being loaded"
371        || normalized.contains(r#""error":"model is being loaded""#)
372        || normalized.contains(r#""message":"model is being loaded""#)
373        || normalized == "model is currently loading"
374        || normalized.contains(r#""error":"model is currently loading""#)
375        || normalized.contains(r#""message":"model is currently loading""#)
376}
377
378fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
379    error.is_connect()
380}
381
382/// Whether a send-time error means the backend is *unreachable or temporarily
383/// failing* (vs. a real misconfiguration). Broader than the in-request retry
384/// predicate: a per-request timeout is transient for the build/refresh layer
385/// (the model may still be cold-loading) but we don't burn the 3 fast
386/// in-request attempts on it — the build-level retry rides it out instead.
387fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
388    error.is_connect() || error.is_timeout()
389}
390
391fn embedding_response_read_error_is_transient(error: &reqwest::Error) -> bool {
392    embedding_send_error_is_transient(error) || error.is_body() || error.is_decode()
393}
394
/// Stable machine marker prefixed onto embedding error strings whose root
/// cause is transient — the backend is down, timing out, or returning
/// 5xx/429, not misconfigured. The build and corpus-refresh layers decide
/// retry-vs-give-up from this marker (see [`embedding_failure_is_transient`])
/// rather than re-parsing error text, so transience stays authoritative at
/// the one site that knows it. Remove it before any user-facing display with
/// [`strip_transient_embedding_marker`].
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";

/// True when `error` contains the transient marker anywhere — i.e. retrying
/// once the backend recovers is the right move, not surfacing a hard failure.
pub fn embedding_failure_is_transient(error: &str) -> bool {
    error.find(TRANSIENT_EMBEDDING_MARKER).is_some()
}

/// Return `error` with every occurrence of the transient marker removed, so
/// the message is clean for display.
pub fn strip_transient_embedding_marker(error: &str) -> String {
    error.split(TRANSIENT_EMBEDDING_MARKER).collect()
}
413
414fn sleep_before_embedding_retry(attempt_index: usize) {
415    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
416        std::thread::sleep(Duration::from_millis(*delay_ms));
417    }
418}
419
420fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
421where
422    F: FnMut() -> reqwest::blocking::RequestBuilder,
423{
424    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
425        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
426
427        let response = match make_request().send() {
428            Ok(response) => response,
429            Err(error) => {
430                if !last_attempt && is_retryable_embedding_error(&error) {
431                    sleep_before_embedding_retry(attempt_index);
432                    continue;
433                }
434                // Connect/timeout failures mean the backend is unreachable or
435                // cold-loading — mark transient so the build layer rides it out
436                // and self-heals instead of parking the index in `Failed`.
437                let marker = if embedding_send_error_is_transient(&error) {
438                    TRANSIENT_EMBEDDING_MARKER
439                } else {
440                    ""
441                };
442                return Err(format!("{marker}{backend_label} request failed: {error}"));
443            }
444        };
445
446        let status = response.status();
447        let raw = match response.text() {
448            Ok(raw) => raw,
449            Err(error) => {
450                if !last_attempt && embedding_response_read_error_is_transient(&error) {
451                    sleep_before_embedding_retry(attempt_index);
452                    continue;
453                }
454                let marker = if embedding_response_read_error_is_transient(&error) {
455                    TRANSIENT_EMBEDDING_MARKER
456                } else {
457                    ""
458                };
459                return Err(format!(
460                    "{marker}{backend_label} response read failed: {error}"
461                ));
462            }
463        };
464
465        if status.is_success() {
466            return Ok(raw);
467        }
468
469        // A 4xx whose body says the model is loading/unloaded is transient on
470        // local backends (LM Studio/Ollama), so treat it like a retryable
471        // status: ride it out at both the in-request and build-retry layers.
472        let body_transient = embedding_response_body_is_transient(status, &raw);
473        if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
474            sleep_before_embedding_retry(attempt_index);
475            continue;
476        }
477
478        // 5xx / 429 are server-side and transient — the backend is overloaded
479        // or briefly unavailable, not misconfigured. A 4xx whose body indicates
480        // the model is (un)loading is also transient (local backend mid-swap).
481        // Other 4xx (auth, bad request, model-not-found) is a real error the
482        // user must fix; no marker.
483        let marker = if is_retryable_embedding_status(status) || body_transient {
484            TRANSIENT_EMBEDDING_MARKER
485        } else {
486            ""
487        };
488        return Err(format!(
489            "{marker}{backend_label} request failed (HTTP {}): {}",
490            status, raw
491        ));
492    }
493
494    unreachable!("embedding request retries exhausted without returning")
495}
496
497fn configured_embedding_timeout_ms(config: &SemanticBackendConfig) -> u64 {
498    if config.timeout_ms == 0 {
499        DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
500    } else {
501        config.timeout_ms
502    }
503}
504
505impl SemanticEmbeddingModel {
506    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
507        Self::from_config_with_timeout_ms(config, configured_embedding_timeout_ms(config))
508    }
509
510    pub fn from_config_for_query(config: &SemanticBackendConfig) -> Result<Self, String> {
511        let timeout_ms =
512            configured_embedding_timeout_ms(config).min(DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS);
513        Self::from_config_with_timeout_ms(config, timeout_ms)
514    }
515
516    fn from_config_with_timeout_ms(
517        config: &SemanticBackendConfig,
518        timeout_ms: u64,
519    ) -> Result<Self, String> {
520        let max_batch_size = if config.max_batch_size == 0 {
521            DEFAULT_MAX_BATCH_SIZE
522        } else {
523            config.max_batch_size
524        };
525
526        let api_key_env = normalize_api_key(config.api_key_env.clone());
527        let model = config.model.clone();
528
529        let client = Client::builder()
530            .timeout(Duration::from_millis(timeout_ms))
531            .redirect(reqwest::redirect::Policy::none())
532            .build()
533            .map_err(|error| format!("failed to configure embedding client: {error}"))?;
534
535        let engine = match config.backend {
536            SemanticBackend::Fastembed => {
537                SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
538            }
539            SemanticBackend::OpenAiCompatible => {
540                let raw = config.base_url.as_ref().ok_or_else(|| {
541                    "base_url is required for openai_compatible backend".to_string()
542                })?;
543                let base_url = normalize_base_url(raw)?;
544
545                let api_key = match api_key_env {
546                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
547                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
548                    })?),
549                    None => None,
550                };
551
552                SemanticEmbeddingEngine::OpenAiCompatible {
553                    client,
554                    model,
555                    base_url,
556                    api_key,
557                }
558            }
559            SemanticBackend::Ollama => {
560                let raw = config
561                    .base_url
562                    .as_ref()
563                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
564                let base_url = normalize_base_url(raw)?;
565
566                SemanticEmbeddingEngine::Ollama {
567                    client,
568                    model,
569                    base_url,
570                }
571            }
572        };
573
574        Ok(Self {
575            backend: config.backend,
576            model: config.model.clone(),
577            base_url: config.base_url.clone(),
578            timeout_ms,
579            max_batch_size,
580            dimension: None,
581            engine,
582            query_embedding_cache: HashMap::new(),
583            query_embedding_cache_order: VecDeque::new(),
584            query_embedding_cache_hits: 0,
585            query_embedding_cache_misses: 0,
586        })
587    }
588
589    pub fn backend(&self) -> SemanticBackend {
590        self.backend
591    }
592
593    pub fn model(&self) -> &str {
594        &self.model
595    }
596
597    pub fn base_url(&self) -> Option<&str> {
598        self.base_url.as_deref()
599    }
600
601    pub fn max_batch_size(&self) -> usize {
602        self.max_batch_size
603    }
604
605    pub fn timeout_ms(&self) -> u64 {
606        self.timeout_ms
607    }
608
609    pub fn fingerprint(
610        &mut self,
611        config: &SemanticBackendConfig,
612    ) -> Result<SemanticIndexFingerprint, String> {
613        let dimension = self.dimension()?;
614        Ok(SemanticIndexFingerprint::from_config(config, dimension))
615    }
616
617    pub fn dimension(&mut self) -> Result<usize, String> {
618        if let Some(dimension) = self.dimension {
619            return Ok(dimension);
620        }
621
622        let dimension = match &mut self.engine {
623            SemanticEmbeddingEngine::Local(model) => {
624                let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
625                vectors
626                    .first()
627                    .map(|v| v.len())
628                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
629            }
630            SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
631                let vectors =
632                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
633                vectors
634                    .first()
635                    .map(|v| v.len())
636                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
637            }
638            SemanticEmbeddingEngine::Ollama { .. } => {
639                let vectors =
640                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
641                vectors
642                    .first()
643                    .map(|v| v.len())
644                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
645            }
646        };
647
648        self.dimension = Some(dimension);
649        Ok(dimension)
650    }
651
652    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
653        self.embed_texts(texts)
654    }
655
656    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
657        if let Some(vector) = self.query_embedding_cache.get(query) {
658            self.query_embedding_cache_hits += 1;
659            return Ok(vector.clone());
660        }
661
662        self.query_embedding_cache_misses += 1;
663        let embeddings = self.embed_texts(vec![query.to_string()])?;
664        let vector = embeddings
665            .first()
666            .cloned()
667            .ok_or_else(|| "embedding model returned no query vector".to_string())?;
668
669        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
670            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
671                self.query_embedding_cache.remove(&oldest);
672            }
673        }
674        self.query_embedding_cache
675            .insert(query.to_string(), vector.clone());
676        self.query_embedding_cache_order
677            .push_back(query.to_string());
678
679        Ok(vector)
680    }
681
682    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
683        (
684            self.query_embedding_cache_hits,
685            self.query_embedding_cache_misses,
686            self.query_embedding_cache.len(),
687        )
688    }
689
690    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
691        match &mut self.engine {
692            SemanticEmbeddingEngine::Local(model) => model
693                .embed(&texts)
694                .map_err(|error| format!("failed to embed batch: {error}")),
695            SemanticEmbeddingEngine::OpenAiCompatible {
696                client,
697                model,
698                base_url,
699                api_key,
700            } => {
701                let expected_text_count = texts.len();
702                let endpoint = build_openai_embeddings_endpoint(base_url);
703                let body = serde_json::json!({
704                    "input": texts,
705                    "model": model,
706                });
707
708                let raw = send_embedding_request(
709                    || {
710                        // `.json(&body)` sets Content-Type: application/json
711                        // automatically. Do NOT add `.header("Content-Type",
712                        // "application/json")` afterwards — RequestBuilder::header()
713                        // calls HeaderMap::append, which produces TWO Content-Type
714                        // headers on the wire. OpenAI's /v1/embeddings endpoint
715                        // treats duplicate Content-Type as malformed and rejects
716                        // the body with 400 "you must provide a model parameter"
717                        // even when `model` is set. Verified end-to-end against
718                        // api.openai.com. See issue #36.
719                        let mut request = client.post(&endpoint).json(&body);
720
721                        if let Some(api_key) = api_key {
722                            request = request.header("Authorization", format!("Bearer {api_key}"));
723                        }
724
725                        request
726                    },
727                    "openai compatible",
728                )?;
729
730                #[derive(Deserialize)]
731                struct OpenAiResponse {
732                    data: Vec<OpenAiEmbeddingResult>,
733                }
734
735                #[derive(Deserialize)]
736                struct OpenAiEmbeddingResult {
737                    embedding: Vec<f32>,
738                    index: Option<u32>,
739                }
740
741                let parsed: OpenAiResponse = serde_json::from_str(&raw)
742                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
743                if parsed.data.len() != expected_text_count {
744                    return Err(format!(
745                        "openai compatible response returned {} embeddings for {} inputs",
746                        parsed.data.len(),
747                        expected_text_count
748                    ));
749                }
750
751                let mut vectors = vec![Vec::new(); parsed.data.len()];
752                for (i, item) in parsed.data.into_iter().enumerate() {
753                    let index = item.index.unwrap_or(i as u32) as usize;
754                    if index >= vectors.len() {
755                        return Err(
756                            "openai compatible response contains invalid vector index".to_string()
757                        );
758                    }
759                    vectors[index] = item.embedding;
760                }
761
762                for vector in &vectors {
763                    if vector.is_empty() {
764                        return Err(
765                            "openai compatible response contained missing vectors".to_string()
766                        );
767                    }
768                }
769
770                self.dimension = vectors.first().map(Vec::len);
771                Ok(vectors)
772            }
773            SemanticEmbeddingEngine::Ollama {
774                client,
775                model,
776                base_url,
777            } => {
778                let expected_text_count = texts.len();
779                let endpoint = build_ollama_embeddings_endpoint(base_url);
780
781                #[derive(Serialize)]
782                struct OllamaPayload<'a> {
783                    model: &'a str,
784                    input: Vec<String>,
785                }
786
787                let payload = OllamaPayload {
788                    model,
789                    input: texts,
790                };
791
792                let raw = send_embedding_request(
793                    || {
794                        // `.json(&payload)` sets Content-Type automatically.
795                        // Same duplicate-header trap as the OpenAI branch above
796                        // — most Ollama servers tolerate it, but the
797                        // single-Content-Type form is the correct one.
798                        client.post(&endpoint).json(&payload)
799                    },
800                    "ollama",
801                )?;
802
803                #[derive(Deserialize)]
804                struct OllamaResponse {
805                    embeddings: Vec<Vec<f32>>,
806                }
807
808                let parsed: OllamaResponse = serde_json::from_str(&raw)
809                    .map_err(|error| format!("invalid ollama response: {error}"))?;
810                if parsed.embeddings.is_empty() {
811                    return Err("ollama response returned no embeddings".to_string());
812                }
813                if parsed.embeddings.len() != expected_text_count {
814                    return Err(format!(
815                        "ollama response returned {} embeddings for {} inputs",
816                        parsed.embeddings.len(),
817                        expected_text_count
818                    ));
819                }
820
821                let vectors = parsed.embeddings;
822                for vector in &vectors {
823                    if vector.is_empty() {
824                        return Err("ollama response contained empty embeddings".to_string());
825                    }
826                }
827
828                self.dimension = vectors.first().map(Vec::len);
829                Ok(vectors)
830            }
831        }
832    }
833}
834
/// Pre-validate ONNX Runtime by loading it directly before the ort crate touches it.
///
/// The library named by the `ORT_DYLIB_PATH` environment variable (or the
/// platform default name when the variable is unset) is opened with `dlopen`
/// on Linux/macOS or `LoadLibraryExW` on Windows, so that a missing or
/// unloadable library is reported as a readable error.
///
/// When a version can be detected it must be `1.x` with a minor of at least
/// 20; anything else is rejected with the message built by
/// `format_ort_version_mismatch`. On Linux/macOS the version is parsed from
/// the file name of the library that was actually loaded (located through its
/// `OrtGetApiBase` symbol), falling back to the requested name. On Windows it
/// is read from the DLL's fixed version resource. A version that cannot be
/// detected is not treated as an error.
///
/// On targets other than Linux, macOS and Windows no check runs and the
/// function returns `Ok(())`.
///
/// # Errors
///
/// Returns a user-facing message when the library name cannot be converted to
/// a C string, when the library fails to load, or when the detected version
/// is not 1.20 or newer within the 1.x series.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                // dlerror() may itself return null, so fall back to a fixed
                // message rather than dereferencing it.
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Try to detect the runtime version from the actual loaded library
            // path first. A bare dlopen("libonnxruntime.so") may resolve to an
            // older system ORT through loader search paths; checking only the
            // caller-supplied soname would miss that and let ort fail opaquely.
            let (detected_version, version_source) =
                detect_ort_version_from_loaded_library(handle, lib_name);

            // The handle was only needed for probing; release it before
            // reporting the result.
            libc::dlclose(handle);

            // Check version compatibility — we need 1.20+.
            // Any major other than 1 is rejected as well. A version string
            // whose first two components do not parse as integers is ignored.
            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, &version_source));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        // Validate ONNX Runtime availability on Windows by loading the DLL
        // via LoadLibraryExW before the ort crate attempts its own LoadLibrary.
        // This way we can produce a friendly error (with installation hints)
        // instead of a raw LoadLibrary failure from deep inside fastembed.
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        // Use kernel32 LoadLibraryExW for the validation — built-in, no
        // crate dependency required. GetModuleFileNameW resolves the loaded
        // DLL path for version probing via the version.dll API.
        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Local mirror of the Win32 fixed version record returned by
        // VerQueryValueW for the root block. Only `dw_file_version_ms` is
        // read below.
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32, // HIWORD major, LOWORD minor
            dw_file_version_ls: u32, // HIWORD build, LOWORD revision
            dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        unsafe {
            use std::os::windows::ffi::OsStrExt;
            // NUL-terminated UTF-16 copy of the library name for the W APIs.
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // Probe the file version from PE resources so we can reject
            // outdated DLLs (e.g. v1.9.x) before the ort crate panics.
            // A major of 0 doubles as the "version not detected" marker.
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            // Use MAX_UNICODEPATH (32767) so deeply nested ORT paths (e.g.
            // long NuGet package paths under %USERPROFILE%) never truncate.
            // GetModuleFileNameW truncates silently when the buffer is too
            // small, which causes version probing to fail and the version
            // check to be bypassed — better to allocate generously.
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // "\" (NUL-terminated) selects the root block, i.e.
                        // the fixed file info record.
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            // Version compatibility check (mirrors the Linux/macOS path).
            // If version could not be detected (detected_major == 0) we let
            // the load succeed — the ort crate will diagnose further.
            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
1025
1026#[cfg(any(target_os = "linux", target_os = "macos"))]
1027unsafe fn loaded_library_path_from_handle(handle: *mut std::ffi::c_void) -> Option<String> {
1028    let symbol_name = std::ffi::CString::new("OrtGetApiBase").ok()?;
1029    let symbol = unsafe { libc::dlsym(handle, symbol_name.as_ptr()) };
1030    if symbol.is_null() {
1031        return None;
1032    }
1033
1034    let mut info = std::mem::MaybeUninit::<libc::Dl_info>::uninit();
1035    if unsafe { libc::dladdr(symbol, info.as_mut_ptr()) } == 0 {
1036        return None;
1037    }
1038
1039    let info = unsafe { info.assume_init() };
1040    if info.dli_fname.is_null() {
1041        return None;
1042    }
1043
1044    Some(
1045        unsafe { std::ffi::CStr::from_ptr(info.dli_fname) }
1046            .to_string_lossy()
1047            .into_owned(),
1048    )
1049}
1050
1051#[cfg(any(target_os = "linux", target_os = "macos"))]
1052fn detect_ort_version_from_resolved_or_requested(
1053    resolved_path: Option<String>,
1054    requested_lib_name: &str,
1055) -> (Option<String>, String) {
1056    if let Some(path) = resolved_path {
1057        if let Some(version) = detect_ort_version_from_path(&path) {
1058            return (Some(version), path);
1059        }
1060        return (detect_ort_version_from_path(requested_lib_name), path);
1061    }
1062
1063    (
1064        detect_ort_version_from_path(requested_lib_name),
1065        requested_lib_name.to_string(),
1066    )
1067}
1068
1069#[cfg(any(target_os = "linux", target_os = "macos"))]
1070fn detect_ort_version_from_loaded_library(
1071    handle: *mut std::ffi::c_void,
1072    requested_lib_name: &str,
1073) -> (Option<String>, String) {
1074    detect_ort_version_from_resolved_or_requested(
1075        unsafe { loaded_library_path_from_handle(handle) },
1076        requested_lib_name,
1077    )
1078}
1079
1080/// Try to extract the ORT version from the library filename or resolved symlink.
1081/// Examples: "libonnxruntime.so.1.19.0" → "1.19.0", "libonnxruntime.1.24.4.dylib" → "1.24.4"
1082#[cfg(any(target_os = "linux", target_os = "macos"))]
1083fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
1084    let path = std::path::Path::new(lib_path);
1085
1086    // Try the path as given, then follow symlinks
1087    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
1088        .into_iter()
1089        .flatten()
1090    {
1091        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1092            if let Some(version) = extract_version_from_filename(name) {
1093                return Some(version);
1094            }
1095        }
1096    }
1097
1098    // Also check for versioned siblings in the same directory
1099    if let Some(parent) = path.parent() {
1100        if let Ok(entries) = std::fs::read_dir(parent) {
1101            for entry in entries.flatten() {
1102                if let Some(name) = entry.file_name().to_str() {
1103                    if name.starts_with("libonnxruntime") {
1104                        if let Some(version) = extract_version_from_filename(name) {
1105                            return Some(version);
1106                        }
1107                    }
1108                }
1109            }
1110        }
1111    }
1112
1113    None
1114}
1115
1116/// Extract version from filenames like "libonnxruntime.so.1.19.0" or "libonnxruntime.1.24.4.dylib"
1117#[cfg(any(target_os = "linux", target_os = "macos"))]
1118fn extract_version_from_filename(name: &str) -> Option<String> {
1119    // Match patterns: .so.X.Y.Z or .X.Y.Z.dylib or .X.Y.Z.so
1120    let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
1121    re.find(name).map(|m| m.as_str().to_string())
1122}
1123
/// Build the indented shell command shown to users for removing an outdated
/// ONNX Runtime library.
///
/// A path under `/usr/local/lib`, or one of the bare default library names,
/// is treated as a system-wide install: on Linux and macOS the suggestion is
/// then a `sudo rm` over `/usr/local/lib/libonnxruntime*` (plus `ldconfig` on
/// Linux). Every other case, including system-looking paths on other
/// targets, yields a plain `rm` of the quoted path.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib")
        || lib_path.starts_with("/usr/local/lib");

    if is_system_install {
        #[cfg(target_os = "linux")]
        return String::from("   sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig");
        #[cfg(target_os = "macos")]
        return String::from("   sudo rm /usr/local/lib/libonnxruntime*");
    }

    format!("   rm '{lib_path}'")
}
1136
1137/// Build the user-facing error message for an incompatible ONNX Runtime
1138/// install. Extracted as a pure helper so we can unit-test the wording
1139/// stability — the auto-fix recommendation must always come first because
1140/// it's the only safe option, and the system-rm step must remain present
1141/// because some users prefer the system-wide cleanup path.
1142pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1143    format!(
1144        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1145         Solutions:\n\
1146         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1147         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1148         configures the bridge to load it instead of the system library — no \
1149         changes to '{}'.\n\
1150         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1151         {}\n\
1152         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1153         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1154        version,
1155        lib_name,
1156        lib_name,
1157        suggest_removal_command(lib_name),
1158    )
1159}
1160
/// Decide whether an error message means "ONNX Runtime could not be loaded".
///
/// A message that starts (after leading whitespace) with the exact prefix
/// `ONNX Runtime not found.` matches immediately. Otherwise the message is
/// compared case-insensitively (ASCII) and must mention both the runtime
/// itself and some form of dynamic-library load failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_NAMES: &[&str] = &["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_HINTS: &[&str] = &[
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

    mentions_any(RUNTIME_NAMES) && mentions_any(LOAD_FAILURE_HINTS)
}
1186
1187pub fn format_embedding_init_error(error: impl Display) -> String {
1188    let message = error.to_string();
1189
1190    if is_onnx_runtime_unavailable(&message) {
1191        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1192    }
1193
1194    format!("failed to initialize semantic embedding model: {message}")
1195}
1196
/// A chunk of code ready for embedding — derived from a Symbol with context enrichment
///
/// Carries both the text sent to the embedding backend (`embed_text`) and the
/// metadata needed to present the chunk in results.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// Absolute file path
    pub file: PathBuf,
    /// Symbol name
    pub name: String,
    /// Fully-qualified symbol name, when known from the outline scope chain.
    pub qualified_name: Option<String>,
    /// Symbol kind (function, class, struct, etc.)
    pub kind: SymbolKind,
    /// First line of the symbol (0-based internally, inclusive)
    pub start_line: u32,
    /// Last line of the symbol (0-based internally, inclusive)
    pub end_line: u32,
    /// Whether the symbol is exported
    pub exported: bool,
    /// The enriched text that gets embedded (name + file + kind + signature + body snippet)
    pub embed_text: String,
    /// Short code snippet for display in results
    pub snippet: String,
}
1218
/// A stored embedding entry — chunk metadata + vector
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    /// The chunk this entry describes, including the text that was embedded.
    chunk: SemanticChunk,
    /// Embedding vector stored for `chunk`.
    vector: Vec<f32>,
}
1225
/// The semantic index — stores embeddings for all symbols in a project
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// One entry per embedded chunk.
    entries: Vec<EmbeddingEntry>,
    /// Track which files are indexed and their mtime for staleness detection
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Track indexed file sizes alongside mtimes for staleness detection
    file_sizes: HashMap<PathBuf, u64>,
    /// Content hash recorded for each indexed file.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Embedding dimension (384 for MiniLM-L6-v2)
    dimension: usize,
    /// NOTE(review): presumably identifies the backend/model that produced
    /// the stored vectors — confirm against `SemanticIndexFingerprint`.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Root directory of the indexed project; `new` debug-asserts that it is
    /// absolute.
    project_root: PathBuf,
    /// NOTE(review): presumably files whose indexing was postponed — confirm
    /// against the code that populates this set.
    deferred_files: HashSet<PathBuf>,
}
1241
/// Per-file metadata gathered while collecting chunks. When an index is
/// built, these values populate its per-file mtime, size and hash tables.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time recorded for the file.
    mtime: SystemTime,
    /// File size recorded for the file.
    size: u64,
    /// blake3 hash recorded for the file's content.
    content_hash: blake3::Hash,
}
1248
/// Outcome of an incremental semantic-index refresh, expressed in file
/// counts. `total_processed` is the number of current/deleted files that
/// were considered.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    pub changed: usize,
    pub added: usize,
    pub deleted: usize,
    pub total_processed: usize,
}

impl RefreshSummary {
    /// True when the refresh changed, added and deleted nothing.
    /// `total_processed` does not take part in the decision.
    pub fn is_noop(&self) -> bool {
        matches!(
            self,
            Self {
                changed: 0,
                added: 0,
                deleted: 0,
                ..
            }
        )
    }
}
1265
/// Delta produced by refreshing a set of invalidated files, to be applied to
/// an existing index.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Full replacement entries for `completed_paths`, not just newly embedded
    /// chunks. `apply_refresh_update` removes completed paths before extending
    /// this set, so reused chunks must travel in this delta too.
    pub added_entries: Vec<EmbeddingEntry>,
    /// NOTE(review): presumably the freshness metadata to record for each
    /// refreshed path — confirm against `apply_refresh_update`.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Paths whose existing entries are replaced by `added_entries`.
    pub completed_paths: Vec<PathBuf>,
    /// File counts describing this refresh.
    pub summary: RefreshSummary,
}
1276
/// A previously computed embedding kept around so an unchanged chunk can
/// reuse it instead of being embedded again.
#[derive(Debug, Clone)]
struct ReusableEmbedding {
    /// The exact text the vector was stored for; compared verbatim on lookup.
    embed_text: String,
    /// The vector stored for `embed_text`.
    vector: Vec<f32>,
}
1282
/// Reuse lookup table: file path → blake3 hash of a chunk's `embed_text` →
/// every cached embedding whose text produced that hash.
type ChunkReuseMap = HashMap<PathBuf, HashMap<blake3::Hash, Vec<ReusableEmbedding>>>;
1284
/// Search result from a semantic query
///
/// The fields from `file` through `snippet` have the same names and types as
/// the corresponding fields of `SemanticChunk`.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    pub file: PathBuf,
    pub name: String,
    pub qualified_name: Option<String>,
    pub kind: SymbolKind,
    pub start_line: u32,
    pub end_line: u32,
    pub exported: bool,
    pub snippet: String,
    // NOTE(review): presumably the raw similarity between query and chunk —
    // confirm against the search implementation.
    pub score: f32,
    // NOTE(review): presumably the score used for final ordering — confirm.
    pub rank_score: f32,
    // NOTE(review): meaning not visible from this section; confirm against
    // the code that applies result caps.
    pub cap_protected: bool,
    // NOTE(review): presumably a label for where the result came from —
    // confirm the set of values used.
    pub source: &'static str,
}
1301
1302impl SemanticIndex {
1303    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1304        debug_assert!(project_root.is_absolute());
1305        Self {
1306            entries: Vec::new(),
1307            file_mtimes: HashMap::new(),
1308            file_sizes: HashMap::new(),
1309            file_hashes: HashMap::new(),
1310            dimension,
1311            fingerprint: None,
1312            project_root,
1313            deferred_files: HashSet::new(),
1314        }
1315    }
1316
    /// Number of embedded symbol entries.
    ///
    /// This counts chunks, not files; see `indexed_file_count` for the number
    /// of tracked files.
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
1321
    /// Number of files currently tracked by the semantic index.
    ///
    /// Derived from the mtime table, so a tracked file that produced no
    /// entries is still counted.
    pub fn indexed_file_count(&self) -> usize {
        self.file_mtimes.len()
    }
1326
1327    /// Human-readable status label for the index.
1328    pub fn status_label(&self) -> &'static str {
1329        if self.entries.is_empty() {
1330            "empty"
1331        } else {
1332            "ready"
1333        }
1334    }
1335
1336    fn collect_chunks(
1337        project_root: &Path,
1338        files: &[PathBuf],
1339    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1340        let collect_started = std::time::Instant::now();
1341        let per_file: Vec<(
1342            PathBuf,
1343            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1344        )> = files
1345            .par_iter()
1346            .map_init(HashMap::new, |parsers, file| {
1347                let result = collect_semantic_file(project_root, file, parsers);
1348                (file.clone(), result)
1349            })
1350            .collect();
1351
1352        let mut chunks: Vec<SemanticChunk> = Vec::new();
1353        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1354
1355        for (file, result) in per_file {
1356            match result {
1357                Ok((metadata, file_chunks)) => {
1358                    file_metadata.insert(file, metadata);
1359                    chunks.extend(file_chunks);
1360                }
1361                Err(error) => {
1362                    // "unsupported file extension" is expected for non-code files
1363                    // (json, xml, .gitignore, etc.) that get included in the
1364                    // project walk. Pre-fix this was swallowed by .unwrap_or_default();
1365                    // we now skip silently to keep the log clean. Only real read/parse
1366                    // errors are worth surfacing.
1367                    if error == "unsupported file extension" {
1368                        continue;
1369                    }
1370                    slog_warn!(
1371                        "failed to collect semantic chunks for {}: {}",
1372                        file.display(),
1373                        error
1374                    );
1375                }
1376            }
1377        }
1378
1379        slog_info!(
1380            "semantic collect: {} chunks from {} files in {} ms",
1381            chunks.len(),
1382            file_metadata.len(),
1383            collect_started.elapsed().as_millis()
1384        );
1385
1386        (chunks, file_metadata)
1387    }
1388
1389    fn build_chunk_reuse_map(&self, files: &[PathBuf]) -> ChunkReuseMap {
1390        let requested: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1391        let mut reuse_map: ChunkReuseMap = HashMap::new();
1392
1393        for entry in &self.entries {
1394            if !requested.contains(entry.chunk.file.as_path()) {
1395                continue;
1396            }
1397
1398            // `embed_text` is already persisted in the current on-disk format,
1399            // so refresh-time reuse can hash it in memory and confirm the exact
1400            // string without bumping `SEMANTIC_INDEX_VERSION` and forcing every
1401            // user through a full rebuild.
1402            let hash = blake3::hash(entry.chunk.embed_text.as_bytes());
1403            reuse_map
1404                .entry(entry.chunk.file.clone())
1405                .or_default()
1406                .entry(hash)
1407                .or_default()
1408                .push(ReusableEmbedding {
1409                    embed_text: entry.chunk.embed_text.clone(),
1410                    vector: entry.vector.clone(),
1411                });
1412        }
1413
1414        reuse_map
1415    }
1416
1417    fn reusable_vector_for_chunk(
1418        reuse_map: &ChunkReuseMap,
1419        chunk: &SemanticChunk,
1420    ) -> Option<Vec<f32>> {
1421        let hash = blake3::hash(chunk.embed_text.as_bytes());
1422        reuse_map
1423            .get(&chunk.file)?
1424            .get(&hash)?
1425            .iter()
1426            .find(|candidate| candidate.embed_text == chunk.embed_text)
1427            .map(|candidate| candidate.vector.clone())
1428    }
1429
1430    fn entries_for_chunks_with_reuse<F, P>(
1431        chunks: Vec<SemanticChunk>,
1432        reuse_map: &ChunkReuseMap,
1433        embed_fn: &mut F,
1434        max_batch_size: usize,
1435        initial_observed_dimension: Option<usize>,
1436        refresh_label: &str,
1437        progress: &mut P,
1438    ) -> Result<(Vec<EmbeddingEntry>, Option<usize>), String>
1439    where
1440        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1441        P: FnMut(usize, usize),
1442    {
1443        let total_chunks = chunks.len();
1444        progress(0, total_chunks);
1445
1446        let mut entries_by_chunk: Vec<Option<EmbeddingEntry>> = vec![None; total_chunks];
1447        let mut misses: Vec<(usize, SemanticChunk)> = Vec::new();
1448
1449        for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1450            if let Some(vector) = Self::reusable_vector_for_chunk(reuse_map, &chunk) {
1451                entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1452            } else {
1453                misses.push((chunk_index, chunk));
1454            }
1455        }
1456
1457        let mut completed = total_chunks.saturating_sub(misses.len());
1458        if completed > 0 {
1459            progress(completed, total_chunks);
1460        }
1461
1462        let batch_size = max_batch_size.max(1);
1463        let mut observed_dimension = initial_observed_dimension;
1464
1465        for batch_start in (0..misses.len()).step_by(batch_size) {
1466            let batch_end = (batch_start + batch_size).min(misses.len());
1467            let batch_texts: Vec<String> = misses[batch_start..batch_end]
1468                .iter()
1469                .map(|(_, chunk)| chunk.embed_text.clone())
1470                .collect();
1471
1472            let vectors = embed_fn(batch_texts)?;
1473            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1474
1475            if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1476                match observed_dimension {
1477                    None => observed_dimension = Some(dim),
1478                    Some(expected) if dim != expected => {
1479                        return Err(format!(
1480                            "embedding dimension changed during {refresh_label}: \
1481                             cached index uses {expected}, new vectors use {dim}"
1482                        ));
1483                    }
1484                    _ => {}
1485                }
1486            }
1487
1488            for (i, vector) in vectors.into_iter().enumerate() {
1489                let (chunk_index, chunk) = misses[batch_start + i].clone();
1490                entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1491            }
1492
1493            completed += batch_end - batch_start;
1494            progress(completed, total_chunks);
1495        }
1496
1497        let entries = entries_by_chunk
1498            .into_iter()
1499            .map(|entry| entry.expect("semantic refresh accounted for every chunk"))
1500            .collect();
1501
1502        Ok((entries, observed_dimension))
1503    }
1504
1505    fn build_from_chunks<F, P>(
1506        project_root: &Path,
1507        chunks: Vec<SemanticChunk>,
1508        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1509        embed_fn: &mut F,
1510        max_batch_size: usize,
1511        mut progress: Option<&mut P>,
1512    ) -> Result<Self, String>
1513    where
1514        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1515        P: FnMut(usize, usize),
1516    {
1517        debug_assert!(project_root.is_absolute());
1518        let total_chunks = chunks.len();
1519
1520        if chunks.is_empty() {
1521            return Ok(Self {
1522                entries: Vec::new(),
1523                file_mtimes: file_metadata
1524                    .iter()
1525                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
1526                    .collect(),
1527                file_sizes: file_metadata
1528                    .iter()
1529                    .map(|(path, metadata)| (path.clone(), metadata.size))
1530                    .collect(),
1531                file_hashes: file_metadata
1532                    .into_iter()
1533                    .map(|(path, metadata)| (path, metadata.content_hash))
1534                    .collect(),
1535                dimension: DEFAULT_DIMENSION,
1536                fingerprint: None,
1537                project_root: project_root.to_path_buf(),
1538                deferred_files: HashSet::new(),
1539            });
1540        }
1541
1542        // Embed in batches
1543        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1544        let mut expected_dimension: Option<usize> = None;
1545        let batch_size = max_batch_size.max(1);
1546        let embed_started = std::time::Instant::now();
1547        let batch_count = total_chunks.div_ceil(batch_size);
1548        for batch_start in (0..chunks.len()).step_by(batch_size) {
1549            let batch_end = (batch_start + batch_size).min(chunks.len());
1550            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1551                .iter()
1552                .map(|c| c.embed_text.clone())
1553                .collect();
1554
1555            let vectors = embed_fn(batch_texts)?;
1556            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1557
1558            // Track consistent dimension across all batches
1559            if let Some(dim) = vectors.first().map(|v| v.len()) {
1560                match expected_dimension {
1561                    None => expected_dimension = Some(dim),
1562                    Some(expected) if dim != expected => {
1563                        return Err(format!(
1564                            "embedding dimension changed across batches: expected {expected}, got {dim}"
1565                        ));
1566                    }
1567                    _ => {}
1568                }
1569            }
1570
1571            for (i, vector) in vectors.into_iter().enumerate() {
1572                let chunk_idx = batch_start + i;
1573                entries.push(EmbeddingEntry {
1574                    chunk: chunks[chunk_idx].clone(),
1575                    vector,
1576                });
1577            }
1578
1579            if let Some(callback) = progress.as_mut() {
1580                callback(entries.len(), total_chunks);
1581            }
1582        }
1583
1584        let embed_ms = embed_started.elapsed().as_millis();
1585        let rate = (total_chunks as u128 * 1000)
1586            .checked_div(embed_ms)
1587            .unwrap_or(0) as u64;
1588        slog_info!(
1589            "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1590            total_chunks,
1591            batch_count,
1592            embed_ms,
1593            rate
1594        );
1595
1596        let dimension = entries
1597            .first()
1598            .map(|e| e.vector.len())
1599            .unwrap_or(DEFAULT_DIMENSION);
1600
1601        Ok(Self {
1602            entries,
1603            file_mtimes: file_metadata
1604                .iter()
1605                .map(|(path, metadata)| (path.clone(), metadata.mtime))
1606                .collect(),
1607            file_sizes: file_metadata
1608                .iter()
1609                .map(|(path, metadata)| (path.clone(), metadata.size))
1610                .collect(),
1611            file_hashes: file_metadata
1612                .into_iter()
1613                .map(|(path, metadata)| (path, metadata.content_hash))
1614                .collect(),
1615            dimension,
1616            fingerprint: None,
1617            project_root: project_root.to_path_buf(),
1618            deferred_files: HashSet::new(),
1619        })
1620    }
1621
1622    /// Build the semantic index from a set of files using the provided embedding function.
1623    /// `embed_fn` takes a batch of texts and returns a batch of embedding vectors.
1624    pub fn build<F>(
1625        project_root: &Path,
1626        files: &[PathBuf],
1627        embed_fn: &mut F,
1628        max_batch_size: usize,
1629    ) -> Result<Self, String>
1630    where
1631        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1632    {
1633        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1634        Self::build_from_chunks(
1635            project_root,
1636            chunks,
1637            file_mtimes,
1638            embed_fn,
1639            max_batch_size,
1640            Option::<&mut fn(usize, usize)>::None,
1641        )
1642    }
1643
1644    /// Build the semantic index and report embedding progress using entry counts.
1645    pub fn build_with_progress<F, P>(
1646        project_root: &Path,
1647        files: &[PathBuf],
1648        embed_fn: &mut F,
1649        max_batch_size: usize,
1650        progress: &mut P,
1651    ) -> Result<Self, String>
1652    where
1653        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1654        P: FnMut(usize, usize),
1655    {
1656        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1657        let total_chunks = chunks.len();
1658        progress(0, total_chunks);
1659        Self::build_from_chunks(
1660            project_root,
1661            chunks,
1662            file_mtimes,
1663            embed_fn,
1664            max_batch_size,
1665            Some(progress),
1666        )
1667    }
1668
/// Incrementally refresh entries for changed/new files only, preserving cached
/// embeddings for unchanged files. Used when loading the index from disk and
/// finding that a small fraction of files have changed, been deleted, or appeared.
///
/// Returns `RefreshSummary` describing what changed. On success, `self` is
/// mutated in place and remains a valid index.
///
/// `current_files` is the full set of files the project considers indexable
/// (typically `walk_project_files(...)`). Files in the cache that are no
/// longer in this set are treated as deleted.
///
/// `progress` is called with `(0, 0)` on every path that embeds nothing;
/// otherwise it is handed to `entries_for_chunks_with_reuse`.
///
/// # Errors
///
/// Propagates the error from `entries_for_chunks_with_reuse`. By that point
/// entries for deleted/vanished files have already been dropped and mtime/size
/// updates for content-fresh files have already been applied.
pub fn refresh_stale_files<F, P>(
    &mut self,
    project_root: &Path,
    current_files: &[PathBuf],
    embed_fn: &mut F,
    max_batch_size: usize,
    progress: &mut P,
) -> Result<RefreshSummary, String>
where
    F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
    P: FnMut(usize, usize),
{
    self.backfill_missing_file_sizes();

    // 1. Bucket files into deleted / changed / added.
    let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
    // Deferred files that are no longer part of the project are forgotten.
    self.deferred_files
        .retain(|path| current_set.contains(path.as_path()));
    // Size of the union of the walked set and the cached set:
    // |current| + |cached| - |current ∩ cached|.
    let total_processed = current_set.len() + self.file_mtimes.len()
        - self
            .file_mtimes
            .keys()
            .filter(|path| current_set.contains(path.as_path()))
            .count();

    // Files in cache that disappeared from disk OR are no longer in the
    // walked set. Both cases need their entries dropped.
    // Per-file outcome of the classification pass below.
    enum IndexedFileCheck {
        Deleted(PathBuf),
        MissingMetadata(PathBuf),
        Verified(PathBuf, FreshnessVerdict),
    }

    let mut deleted: Vec<PathBuf> = Vec::new();
    let mut changed: Vec<PathBuf> = Vec::new();
    let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
    // One slot per indexed path; `None` marks a slot that is filled in later
    // from the strict verification results.
    let mut checks: Vec<Option<IndexedFileCheck>> = Vec::with_capacity(indexed_paths.len());
    let mut strict_verify_inputs: Vec<(usize, PathBuf, FileFreshness)> = Vec::new();

    for indexed_path in indexed_paths {
        let check_index = checks.len();
        if !current_set.contains(indexed_path.as_path()) {
            checks.push(Some(IndexedFileCheck::Deleted(indexed_path)));
            continue;
        }
        // A file can only be verified when mtime, size and hash are all cached.
        let cached = match (
            self.file_mtimes.get(&indexed_path),
            self.file_sizes.get(&indexed_path),
            self.file_hashes.get(&indexed_path),
        ) {
            (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                mtime: *mtime,
                size: *size,
                content_hash: *hash,
            }),
            _ => None,
        };
        if let Some(freshness) = cached {
            strict_verify_inputs.push((check_index, indexed_path, freshness));
            checks.push(None);
        } else {
            checks.push(Some(IndexedFileCheck::MissingMetadata(indexed_path)));
        }
    }

    // Fill the reserved slots using the index carried alongside each input.
    for (check_index, path, verdict) in
        cache_freshness::verify_files_strict_bounded(strict_verify_inputs)
    {
        checks[check_index] = Some(IndexedFileCheck::Verified(path, verdict));
    }

    for check in checks {
        // Panics if the verifier did not return a verdict for some input.
        match check.expect("strict freshness check should be populated") {
            IndexedFileCheck::Deleted(path) => deleted.push(path),
            // Incomplete cached metadata: re-index the file.
            IndexedFileCheck::MissingMetadata(path) => changed.push(path),
            IndexedFileCheck::Verified(_path, FreshnessVerdict::HotFresh) => {}
            // Entries are kept; only the cached mtime/size are refreshed.
            IndexedFileCheck::Verified(
                path,
                FreshnessVerdict::ContentFresh {
                    new_mtime,
                    new_size,
                },
            ) => {
                self.file_mtimes.insert(path.clone(), new_mtime);
                self.file_sizes.insert(path, new_size);
            }
            // A `Deleted` verdict for a file still in the walked set is queued
            // as changed; the `vanished` check below drops it if it is gone.
            IndexedFileCheck::Verified(
                path,
                FreshnessVerdict::Stale | FreshnessVerdict::Deleted,
            ) => {
                changed.push(path);
            }
        }
    }

    // Files in walk that were never indexed.
    let mut added: Vec<PathBuf> = Vec::new();
    for path in current_files {
        if !self.file_mtimes.contains_key(path) {
            added.push(path.clone());
        }
    }

    // Fast path: nothing to do.
    if deleted.is_empty() && changed.is_empty() && added.is_empty() {
        progress(0, 0);
        return Ok(RefreshSummary {
            total_processed,
            ..RefreshSummary::default()
        });
    }

    // 2. Drop entries for deleted files immediately. Changed files are only
    //    replaced after successful re-extraction + embedding so transient
    //    read/parse errors keep the stale-but-valid cache entry.
    if !deleted.is_empty() {
        self.remove_indexed_files(&deleted);
    }

    // 3. Embed the changed + added set, if any.
    let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
    to_embed.extend(changed.iter().cloned());
    to_embed.extend(added.iter().cloned());

    if to_embed.is_empty() {
        // Only deletions happened.
        progress(0, 0);
        return Ok(RefreshSummary {
            changed: 0,
            added: 0,
            deleted: deleted.len(),
            total_processed,
        });
    }

    // The reuse map is built before any changed file's entries are replaced.
    let reuse_map = self.build_chunk_reuse_map(&changed);
    let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
    let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
    // Previously indexed files that produced no fresh metadata and no longer
    // exist on disk are treated as deleted.
    let vanished = to_embed
        .iter()
        .filter(|path| {
            changed_set.contains(path.as_path())
                && !fresh_metadata.contains_key(*path)
                && !path.exists()
        })
        .cloned()
        .collect::<Vec<_>>();
    if !vanished.is_empty() {
        self.remove_indexed_files(&vanished);
        deleted.extend(vanished);
    }

    if chunks.is_empty() {
        // Nothing to embed, but files with fresh metadata were still processed:
        // drop their old entries and record the new metadata.
        progress(0, 0);
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }
        let changed_count = changed
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count();
        let added_count = added
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count();
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file.clone(), metadata.content_hash);
        }
        return Ok(RefreshSummary {
            changed: changed_count,
            added: added_count,
            deleted: deleted.len(),
            total_processed,
        });
    }

    // 4. Build the full replacement set, reusing cached vectors for chunks
    //    whose embed_text is unchanged and embedding only cache misses.
    // With no entries left there is no dimension to enforce on the new vectors.
    let existing_dimension = if self.entries.is_empty() {
        None
    } else {
        Some(self.dimension)
    };
    let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
        chunks,
        &reuse_map,
        embed_fn,
        max_batch_size,
        existing_dimension,
        "incremental refresh",
        progress,
    )?;

    // Only files that yielded fresh metadata have their old entries replaced;
    // everything else keeps its existing entries.
    let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
    for file in &successful_files {
        self.deferred_files.remove(file);
    }
    if !successful_files.is_empty() {
        self.entries
            .retain(|entry| !successful_files.contains(&entry.chunk.file));
    }

    self.entries.extend(new_entries);
    for (file, metadata) in fresh_metadata {
        self.file_mtimes.insert(file.clone(), metadata.mtime);
        self.file_sizes.insert(file.clone(), metadata.size);
        self.file_hashes.insert(file, metadata.content_hash);
    }
    if let Some(dim) = observed_dimension {
        self.dimension = dim;
    }

    // Counts only include files that were actually re-read successfully.
    Ok(RefreshSummary {
        changed: changed
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count(),
        added: added
            .iter()
            .filter(|path| successful_files.contains(*path))
            .count(),
        deleted: deleted.len(),
        total_processed,
    })
}
1911
/// Refresh exactly the files invalidated by the live watcher, without
/// treating the provided path list as the whole project. This is the
/// watcher-side counterpart to `refresh_stale_files`: it drops any stale
/// entries for the requested paths from this in-memory index, re-extracts
/// whatever still exists on disk, embeds those chunks, and returns the
/// delta needed for another in-memory index to apply the same update.
///
/// Paths remembered in `self.deferred_files` that still exist on disk are
/// processed together with `paths`. `max_files` caps the number of indexed
/// files: never-indexed files that would exceed it are recorded in
/// `self.deferred_files` instead of being embedded. `progress` is called with
/// `(0, 0)` on every path that embeds nothing.
///
/// # Errors
///
/// Propagates the error from `entries_for_chunks_with_reuse`. The requested
/// paths have already been removed from this index at that point.
pub fn refresh_invalidated_files<F, P>(
    &mut self,
    project_root: &Path,
    paths: &[PathBuf],
    embed_fn: &mut F,
    max_batch_size: usize,
    max_files: usize,
    progress: &mut P,
) -> Result<InvalidatedFilesRefresh, String>
where
    F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
    P: FnMut(usize, usize),
{
    self.backfill_missing_file_sizes();

    // Forget deferred files that no longer exist, then fold the remaining
    // ones into this request as a sorted, de-duplicated list.
    self.deferred_files.retain(|path| path.exists());
    let mut requested_paths = paths.to_vec();
    requested_paths.extend(self.deferred_files.iter().cloned());
    requested_paths.sort();
    requested_paths.dedup();
    let total_processed = requested_paths.len();

    if requested_paths.is_empty() {
        progress(0, 0);
        return Ok(InvalidatedFilesRefresh {
            summary: RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            },
            ..InvalidatedFilesRefresh::default()
        });
    }

    // Snapshot which requested paths were indexed, and capture reusable
    // chunks, before the removal below wipes that information.
    let previously_indexed: HashSet<PathBuf> = requested_paths
        .iter()
        .filter(|path| self.file_mtimes.contains_key(*path))
        .cloned()
        .collect();
    let reuse_map = self.build_chunk_reuse_map(&requested_paths);

    // The watcher path has already invalidated these files in the request
    // thread's live index. Mirror that behavior here before inserting any
    // fresh chunks so parse/read failures do not resurrect stale entries.
    self.remove_indexed_files(&requested_paths);

    let existing_paths = requested_paths
        .iter()
        .filter(|path| path.exists())
        .cloned()
        .collect::<Vec<_>>();
    // Only paths that were indexed before and are now gone count as deleted.
    let deleted = requested_paths
        .iter()
        .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
        .count();

    if existing_paths.is_empty() {
        for path in &requested_paths {
            if !path.exists() {
                self.deferred_files.remove(path);
            }
        }
        progress(0, 0);
        return Ok(InvalidatedFilesRefresh {
            completed_paths: requested_paths,
            summary: RefreshSummary {
                deleted,
                total_processed,
                ..RefreshSummary::default()
            },
            ..InvalidatedFilesRefresh::default()
        });
    }

    let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);

    // Files still indexed after the removal above.
    let retained_file_count = self.file_mtimes.len();
    // Previously indexed files that were re-read successfully; they re-occupy
    // their slot under the cap.
    let changed_successful_count = existing_paths
        .iter()
        .filter(|path| {
            previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
        })
        .count();
    // Room left under `max_files` for files that were never indexed before.
    let available_new_files =
        max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
    let new_successful_files = existing_paths
        .iter()
        .filter(|path| {
            !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
        })
        .cloned()
        .collect::<Vec<_>>();
    if new_successful_files.len() > available_new_files {
        // Admit new files in `existing_paths` order (sorted above) until the
        // cap is reached; the remainder is deferred.
        let allowed_new_files = new_successful_files
            .iter()
            .take(available_new_files)
            .cloned()
            .collect::<HashSet<_>>();
        let deferred_new_files = new_successful_files
            .into_iter()
            .filter(|path| !allowed_new_files.contains(path))
            .collect::<HashSet<_>>();

        // Strip deferred files from both the metadata and the chunk list so
        // they are neither embedded nor recorded as indexed.
        fresh_metadata.retain(|file, _| {
            previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
        });
        chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));

        if !deferred_new_files.is_empty() {
            for path in &deferred_new_files {
                self.deferred_files.insert(path.clone());
            }
            slog_warn!(
                "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
                deferred_new_files.len(),
                max_files
            );
        }
    }

    // Files remaining in `fresh_metadata` are the ones this refresh completes.
    let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
    for file in &successful_files {
        self.deferred_files.remove(file);
    }
    let changed = successful_files
        .iter()
        .filter(|path| previously_indexed.contains(path.as_path()))
        .count();
    let added = successful_files.len().saturating_sub(changed);
    let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());

    if chunks.is_empty() {
        // Nothing to embed; still record the fresh metadata for processed files.
        progress(0, 0);
        for (file, metadata) in fresh_metadata {
            let freshness = FileFreshness {
                mtime: metadata.mtime,
                size: metadata.size,
                content_hash: metadata.content_hash,
            };
            self.file_mtimes.insert(file.clone(), freshness.mtime);
            self.file_sizes.insert(file.clone(), freshness.size);
            self.file_hashes
                .insert(file.clone(), freshness.content_hash);
            updated_metadata.push((file, freshness));
        }

        return Ok(InvalidatedFilesRefresh {
            updated_metadata,
            completed_paths: requested_paths,
            summary: RefreshSummary {
                changed,
                added,
                deleted,
                total_processed,
            },
            ..InvalidatedFilesRefresh::default()
        });
    }

    // Only skip dimension enforcement when the index holds no entries and none
    // of the requested files had been indexed before.
    let initial_observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty()
    {
        None
    } else {
        Some(self.dimension)
    };
    let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
        chunks,
        &reuse_map,
        embed_fn,
        max_batch_size,
        initial_observed_dimension,
        "invalidated-file refresh",
        progress,
    )?;

    // Keep a copy of the new entries for the returned delta; the originals
    // are moved into this index.
    let added_entries = new_entries.clone();
    self.entries.extend(new_entries);
    for (file, metadata) in fresh_metadata {
        let freshness = FileFreshness {
            mtime: metadata.mtime,
            size: metadata.size,
            content_hash: metadata.content_hash,
        };
        self.file_mtimes.insert(file.clone(), freshness.mtime);
        self.file_sizes.insert(file.clone(), freshness.size);
        self.file_hashes
            .insert(file.clone(), freshness.content_hash);
        updated_metadata.push((file, freshness));
    }
    if let Some(dim) = observed_dimension {
        self.dimension = dim;
    }

    Ok(InvalidatedFilesRefresh {
        added_entries,
        updated_metadata,
        completed_paths: requested_paths,
        summary: RefreshSummary {
            changed,
            added,
            deleted,
            total_processed,
        },
    })
}
2122
2123    pub fn apply_refresh_update(
2124        &mut self,
2125        added_entries: Vec<EmbeddingEntry>,
2126        updated_metadata: Vec<(PathBuf, FileFreshness)>,
2127        completed_paths: &[PathBuf],
2128    ) {
2129        // `added_entries` is the complete replacement set for completed paths:
2130        // freshly embedded misses plus reused chunks carrying refreshed metadata.
2131        // Removing first is safe only because producers include both kinds.
2132        self.remove_indexed_files(completed_paths);
2133
2134        let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
2135        self.entries.extend(added_entries);
2136        for (file, freshness) in updated_metadata {
2137            self.file_mtimes.insert(file.clone(), freshness.mtime);
2138            self.file_sizes.insert(file.clone(), freshness.size);
2139            self.file_hashes.insert(file, freshness.content_hash);
2140        }
2141        if let Some(dim) = observed_dimension {
2142            self.dimension = dim;
2143        }
2144    }
2145
2146    fn remove_indexed_files(&mut self, files: &[PathBuf]) {
2147        let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
2148        self.entries
2149            .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
2150        for path in files {
2151            self.file_mtimes.remove(path);
2152            self.file_sizes.remove(path);
2153            self.file_hashes.remove(path);
2154        }
2155    }
2156
2157    /// Search the index with a query embedding, returning top-K results sorted by relevance
2158    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
2159        if self.entries.is_empty() || query_vector.len() != self.dimension {
2160            return Vec::new();
2161        }
2162
2163        let mut scored: Vec<(f32, usize)> = self
2164            .entries
2165            .iter()
2166            .enumerate()
2167            .map(|(i, entry)| {
2168                let mut score = cosine_similarity(query_vector, &entry.vector);
2169                if entry.chunk.exported {
2170                    score *= 1.1;
2171                }
2172                (score, i)
2173            })
2174            .collect();
2175
2176        let keep = top_k.min(scored.len());
2177        if keep == 0 {
2178            return Vec::new();
2179        }
2180
2181        if keep < scored.len() {
2182            scored.select_nth_unstable_by(keep, semantic_score_order);
2183            scored.truncate(keep);
2184        }
2185        scored.sort_by(semantic_score_order);
2186
2187        scored
2188            .into_iter()
2189            // Keep the selected best-first slice mapped without reintroducing the
2190            // old `> 0.0` floor: top_k has already been selected, and zero-score
2191            // tail entries remain observable when requested.
2192            .map(|(score, idx)| {
2193                let entry = &self.entries[idx];
2194                SemanticResult {
2195                    file: entry.chunk.file.clone(),
2196                    name: entry.chunk.name.clone(),
2197                    qualified_name: entry.chunk.qualified_name.clone(),
2198                    kind: entry.chunk.kind.clone(),
2199                    start_line: entry.chunk.start_line,
2200                    end_line: entry.chunk.end_line,
2201                    exported: entry.chunk.exported,
2202                    snippet: entry.chunk.snippet.clone(),
2203                    score,
2204                    rank_score: score,
2205                    cap_protected: false,
2206                    source: "semantic",
2207                }
2208            })
2209            .collect()
2210    }
2211
/// Number of entries currently held in the index (the length of
/// `self.entries`, not the number of indexed files).
pub fn len(&self) -> usize {
    self.entries.len()
}
2216
2217    /// Check if a file needs re-indexing based on mtime/size
2218    pub fn is_file_stale(&self, file: &Path) -> bool {
2219        let Some(stored_mtime) = self.file_mtimes.get(file) else {
2220            return true;
2221        };
2222        let Some(stored_size) = self.file_sizes.get(file) else {
2223            return true;
2224        };
2225        let Some(stored_hash) = self.file_hashes.get(file) else {
2226            return true;
2227        };
2228        let cached = FileFreshness {
2229            mtime: *stored_mtime,
2230            size: *stored_size,
2231            content_hash: *stored_hash,
2232        };
2233        match cache_freshness::verify_file_strict(file, &cached) {
2234            FreshnessVerdict::HotFresh => false,
2235            FreshnessVerdict::ContentFresh { .. } => false,
2236            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2237        }
2238    }
2239
2240    fn backfill_missing_file_sizes(&mut self) {
2241        for path in self.file_mtimes.keys() {
2242            if self.file_sizes.contains_key(path) {
2243                continue;
2244            }
2245            if let Ok(metadata) = fs::metadata(path) {
2246                self.file_sizes.insert(path.clone(), metadata.len());
2247                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2248                    self.file_hashes.insert(path.clone(), hash);
2249                }
2250            }
2251        }
2252    }
2253
/// Remove entries for a specific file.
///
/// Forwards to `invalidate_file`, so the cached mtime/size/hash records for
/// the file are dropped as well.
pub fn remove_file(&mut self, file: &Path) {
    self.invalidate_file(file);
}
2258
2259    pub fn invalidate_file(&mut self, file: &Path) {
2260        let canonical_file = canonicalize_existing_or_deleted_path(file);
2261        self.entries
2262            .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2263        self.file_mtimes.remove(file);
2264        self.file_sizes.remove(file);
2265        self.file_hashes.remove(file);
2266        if canonical_file.as_path() != file {
2267            self.file_mtimes.remove(&canonical_file);
2268            self.file_sizes.remove(&canonical_file);
2269            self.file_hashes.remove(&canonical_file);
2270        }
2271    }
2272
/// Get the embedding dimension recorded for this index. `search` returns no
/// results for query vectors whose length differs from this value.
pub fn dimension(&self) -> usize {
    self.dimension
}
2277
/// Borrow the fingerprint attached to this index, or `None` if none has been
/// set.
pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
    self.fingerprint.as_ref()
}
2281
2282    pub fn backend_label(&self) -> Option<&str> {
2283        self.fingerprint.as_ref().map(|f| f.backend.as_str())
2284    }
2285
2286    pub fn model_label(&self) -> Option<&str> {
2287        self.fingerprint.as_ref().map(|f| f.model.as_str())
2288    }
2289
/// Attach `fingerprint` to this index, replacing any previously stored one.
pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
    self.fingerprint = Some(fingerprint);
}
2293
2294    /// Write the semantic index to disk using atomic temp+rename pattern
2295    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2296        // Don't persist empty indexes — they would be loaded on next startup
2297        // and prevent a fresh build that might find files.
2298        if self.entries.is_empty() {
2299            slog_info!("skipping semantic index persistence (0 entries)");
2300            return;
2301        }
2302        let dir = storage_dir.join("semantic").join(project_key);
2303        if let Err(e) = fs::create_dir_all(&dir) {
2304            slog_warn!("failed to create semantic cache dir: {}", e);
2305            return;
2306        }
2307        let data_path = dir.join("semantic.bin");
2308        let tmp_path = dir.join(format!(
2309            "semantic.bin.tmp.{}.{}",
2310            std::process::id(),
2311            SystemTime::now()
2312                .duration_since(SystemTime::UNIX_EPOCH)
2313                .unwrap_or(Duration::ZERO)
2314                .as_nanos()
2315        ));
2316        let write_result = (|| -> io::Result<usize> {
2317            let file = fs::File::create(&tmp_path)?;
2318            let mut writer = BufWriter::new(file);
2319            let bytes_written = self.write_to_writer(&mut writer)?;
2320            writer.flush()?;
2321            writer.get_ref().sync_all()?;
2322            Ok(bytes_written)
2323        })();
2324        let bytes_written = match write_result {
2325            Ok(bytes_written) => bytes_written,
2326            Err(e) => {
2327                slog_warn!("failed to write semantic index: {}", e);
2328                let _ = fs::remove_file(&tmp_path);
2329                return;
2330            }
2331        };
2332        if let Err(e) = fs::rename(&tmp_path, &data_path) {
2333            slog_warn!("failed to rename semantic index: {}", e);
2334            let _ = fs::remove_file(&tmp_path);
2335            return;
2336        }
2337        slog_info!(
2338            "semantic index persisted: {} entries, {:.1} KB",
2339            self.entries.len(),
2340            bytes_written as f64 / 1024.0
2341        );
2342    }
2343
2344    /// Read the semantic index from disk
2345    pub fn read_from_disk(
2346        storage_dir: &Path,
2347        project_key: &str,
2348        current_canonical_root: &Path,
2349        is_worktree_bridge: bool,
2350        expected_fingerprint: Option<&str>,
2351    ) -> Option<Self> {
2352        debug_assert!(current_canonical_root.is_absolute());
2353        let data_path = storage_dir
2354            .join("semantic")
2355            .join(project_key)
2356            .join("semantic.bin");
2357        let file = fs::File::open(&data_path).ok()?;
2358        let file_len = usize::try_from(file.metadata().ok()?.len()).ok()?;
2359        if file_len < HEADER_BYTES_V1 {
2360            slog_warn!(
2361                "corrupt semantic index (too small: {} bytes), removing",
2362                file_len
2363            );
2364            if !is_worktree_bridge {
2365                let _ = fs::remove_file(&data_path);
2366            }
2367            return None;
2368        }
2369
2370        let mut reader = BufReader::new(file);
2371        let mut version_buf = [0u8; 1];
2372        reader.read_exact(&mut version_buf).ok()?;
2373        let version = version_buf[0];
2374        if version != SEMANTIC_INDEX_VERSION_V6 && version != SEMANTIC_INDEX_VERSION_V7 {
2375            slog_info!(
2376                "cached semantic index version {} is not compatible with {}, rebuilding",
2377                version,
2378                SEMANTIC_INDEX_VERSION_V7
2379            );
2380            if !is_worktree_bridge {
2381                let _ = fs::remove_file(&data_path);
2382            }
2383            return None;
2384        }
2385        match Self::from_reader_after_version(
2386            reader,
2387            version,
2388            current_canonical_root,
2389            Some(file_len),
2390            1,
2391        ) {
2392            Ok(index) => {
2393                if index.entries.is_empty() {
2394                    slog_info!("cached semantic index is empty, will rebuild");
2395                    if !is_worktree_bridge {
2396                        let _ = fs::remove_file(&data_path);
2397                    }
2398                    return None;
2399                }
2400                if let Some(expected) = expected_fingerprint {
2401                    let matches = index
2402                        .fingerprint()
2403                        .map(|fingerprint| fingerprint.matches_expected(expected))
2404                        .unwrap_or(false);
2405                    if !matches {
2406                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2407                        if !is_worktree_bridge {
2408                            let _ = fs::remove_file(&data_path);
2409                        }
2410                        return None;
2411                    }
2412                }
2413                slog_info!(
2414                    "loaded semantic index from disk: {} entries",
2415                    index.entries.len()
2416                );
2417                Some(index)
2418            }
2419            Err(e) => {
2420                slog_warn!("corrupt semantic index, rebuilding: {}", e);
2421                if !is_worktree_bridge {
2422                    let _ = fs::remove_file(&data_path);
2423                }
2424                None
2425            }
2426        }
2427    }
2428
2429    /// Serialize the index to bytes for disk persistence
2430    pub fn to_bytes(&self) -> Vec<u8> {
2431        let mut buf = Vec::new();
2432        self.write_to_writer(&mut buf)
2433            .expect("writing semantic index to Vec cannot fail");
2434        buf
2435    }
2436
2437    fn write_to_writer<W: Write>(&self, writer: &mut W) -> io::Result<usize> {
2438        let mut bytes_written = 0usize;
2439        let fingerprint = self.fingerprint.as_ref().and_then(|fingerprint| {
2440            let encoded = fingerprint.as_string();
2441            if encoded.is_empty() {
2442                None
2443            } else {
2444                Some(encoded)
2445            }
2446        });
2447        let fp_bytes_ref = fingerprint.as_deref().map(str::as_bytes).unwrap_or(&[]);
2448        let file_mtime_count = self
2449            .file_mtimes
2450            .iter()
2451            .filter(|(path, _)| cache_relative_path(&self.project_root, path).is_some())
2452            .count();
2453        let entry_count = self
2454            .entries
2455            .iter()
2456            .filter(|entry| cache_relative_path(&self.project_root, &entry.chunk.file).is_some())
2457            .count();
2458
2459        // Header: version(1) + dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint
2460        //
2461        // V7 is the single write format. Layout extends V6 with per-entry
2462        // qualified_name metadata while preserving the embedding fingerprint:
2463        //   - fingerprint is always represented (absent ⇒ fingerprint_len=0,
2464        //     no bytes follow). Uniform format simplifies the reader.
2465        //   - paths are relative to project_root.
2466        //   - file metadata stored as secs(u64) + subsec_nanos(u32) + size(u64) + blake3(32).
2467        //     Preserves full APFS/ext4/NTFS precision and catches mtime ties.
2468        //
2469        // V1/V2 remain readable for backward compatibility (see from_bytes).
2470        // V3/V4 load as compatible formats but are rejected on disk so snippets
2471        // and file sizes are rebuilt once. V6 remains accepted on disk and
2472        // yields qualified_name=None until the next V7 write.
2473        let version = SEMANTIC_INDEX_VERSION_V7;
2474        write_counted(writer, &[version], &mut bytes_written)?;
2475        write_counted(
2476            writer,
2477            &(self.dimension as u32).to_le_bytes(),
2478            &mut bytes_written,
2479        )?;
2480        write_counted(
2481            writer,
2482            &(entry_count as u32).to_le_bytes(),
2483            &mut bytes_written,
2484        )?;
2485        write_counted(
2486            writer,
2487            &(fp_bytes_ref.len() as u32).to_le_bytes(),
2488            &mut bytes_written,
2489        )?;
2490        write_counted(writer, fp_bytes_ref, &mut bytes_written)?;
2491
2492        // File mtime table: count(4) + entries
2493        // V3 layout per entry: path_len(4) + path + secs(8) + subsec_nanos(4)
2494        write_counted(
2495            writer,
2496            &(file_mtime_count as u32).to_le_bytes(),
2497            &mut bytes_written,
2498        )?;
2499        for (path, mtime) in &self.file_mtimes {
2500            let Some(relative) = cache_relative_path(&self.project_root, path) else {
2501                continue;
2502            };
2503            let relative = relative.to_string_lossy();
2504            let path_bytes = relative.as_bytes();
2505            write_counted(
2506                writer,
2507                &(path_bytes.len() as u32).to_le_bytes(),
2508                &mut bytes_written,
2509            )?;
2510            write_counted(writer, path_bytes, &mut bytes_written)?;
2511            let duration = mtime
2512                .duration_since(SystemTime::UNIX_EPOCH)
2513                .unwrap_or_default();
2514            write_counted(
2515                writer,
2516                &duration.as_secs().to_le_bytes(),
2517                &mut bytes_written,
2518            )?;
2519            write_counted(
2520                writer,
2521                &duration.subsec_nanos().to_le_bytes(),
2522                &mut bytes_written,
2523            )?;
2524            let size = self.file_sizes.get(path).copied().unwrap_or_default();
2525            write_counted(writer, &size.to_le_bytes(), &mut bytes_written)?;
2526            let hash = self
2527                .file_hashes
2528                .get(path)
2529                .copied()
2530                .unwrap_or_else(cache_freshness::zero_hash);
2531            write_counted(writer, hash.as_bytes(), &mut bytes_written)?;
2532        }
2533
2534        // Entries: each is metadata + vector
2535        for entry in &self.entries {
2536            let Some(relative) = cache_relative_path(&self.project_root, &entry.chunk.file) else {
2537                continue;
2538            };
2539            let c = &entry.chunk;
2540
2541            // File path
2542            let relative = relative.to_string_lossy();
2543            let file_bytes = relative.as_bytes();
2544            write_counted(
2545                writer,
2546                &(file_bytes.len() as u32).to_le_bytes(),
2547                &mut bytes_written,
2548            )?;
2549            write_counted(writer, file_bytes, &mut bytes_written)?;
2550
2551            // Name
2552            let name_bytes = c.name.as_bytes();
2553            write_counted(
2554                writer,
2555                &(name_bytes.len() as u32).to_le_bytes(),
2556                &mut bytes_written,
2557            )?;
2558            write_counted(writer, name_bytes, &mut bytes_written)?;
2559
2560            // Qualified name (V7 metadata; absent is encoded as length 0)
2561            let qualified_name_bytes = c.qualified_name.as_deref().unwrap_or_default().as_bytes();
2562            write_counted(
2563                writer,
2564                &(qualified_name_bytes.len() as u32).to_le_bytes(),
2565                &mut bytes_written,
2566            )?;
2567            write_counted(writer, qualified_name_bytes, &mut bytes_written)?;
2568
2569            // Kind (1 byte)
2570            write_counted(writer, &[symbol_kind_to_u8(&c.kind)], &mut bytes_written)?;
2571
2572            // Lines + exported
2573            write_counted(
2574                writer,
2575                &(c.start_line as u32).to_le_bytes(),
2576                &mut bytes_written,
2577            )?;
2578            write_counted(
2579                writer,
2580                &(c.end_line as u32).to_le_bytes(),
2581                &mut bytes_written,
2582            )?;
2583            write_counted(writer, &[c.exported as u8], &mut bytes_written)?;
2584
2585            // Snippet
2586            let snippet_bytes = c.snippet.as_bytes();
2587            write_counted(
2588                writer,
2589                &(snippet_bytes.len() as u32).to_le_bytes(),
2590                &mut bytes_written,
2591            )?;
2592            write_counted(writer, snippet_bytes, &mut bytes_written)?;
2593
2594            // Embed text
2595            let embed_bytes = c.embed_text.as_bytes();
2596            write_counted(
2597                writer,
2598                &(embed_bytes.len() as u32).to_le_bytes(),
2599                &mut bytes_written,
2600            )?;
2601            write_counted(writer, embed_bytes, &mut bytes_written)?;
2602
2603            // Vector (f32 array)
2604            for &val in &entry.vector {
2605                write_counted(writer, &val.to_le_bytes(), &mut bytes_written)?;
2606            }
2607        }
2608
2609        Ok(bytes_written)
2610    }
2611
2612    /// Deserialize the index from bytes
2613    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2614        debug_assert!(current_canonical_root.is_absolute());
2615        if data.len() < HEADER_BYTES_V1 {
2616            return Err("data too short".to_string());
2617        }
2618
2619        Self::from_reader_after_version(
2620            Cursor::new(&data[1..]),
2621            data[0],
2622            current_canonical_root,
2623            Some(data.len()),
2624            1,
2625        )
2626    }
2627
2628    fn from_reader_after_version<R: Read>(
2629        reader: R,
2630        version: u8,
2631        current_canonical_root: &Path,
2632        total_len: Option<usize>,
2633        bytes_read: usize,
2634    ) -> Result<Self, String> {
2635        debug_assert!(current_canonical_root.is_absolute());
2636        let mut reader = CountingReader::with_bytes_read(reader, bytes_read);
2637
2638        if version != SEMANTIC_INDEX_VERSION_V1
2639            && version != SEMANTIC_INDEX_VERSION_V2
2640            && version != SEMANTIC_INDEX_VERSION_V3
2641            && version != SEMANTIC_INDEX_VERSION_V4
2642            && version != SEMANTIC_INDEX_VERSION_V5
2643            && version != SEMANTIC_INDEX_VERSION_V6
2644            && version != SEMANTIC_INDEX_VERSION_V7
2645        {
2646            return Err(format!("unsupported version: {}", version));
2647        }
2648        // V2 and newer share the same header layout (V3/V4/V5 only differ from
2649        // V2 in the per-mtime entry layout): version(1) + dimension(4) +
2650        // entry_count(4) + fingerprint_len(4) + fingerprint bytes.
2651        if (version == SEMANTIC_INDEX_VERSION_V2
2652            || version == SEMANTIC_INDEX_VERSION_V3
2653            || version == SEMANTIC_INDEX_VERSION_V4
2654            || version == SEMANTIC_INDEX_VERSION_V5
2655            || version == SEMANTIC_INDEX_VERSION_V6
2656            || version == SEMANTIC_INDEX_VERSION_V7)
2657            && total_len.is_some_and(|len| len < HEADER_BYTES_V2)
2658        {
2659            return Err("data too short for semantic index v2/v3/v4/v5/v6/v7 header".to_string());
2660        }
2661
2662        let dimension = read_u32_stream(&mut reader)? as usize;
2663        let entry_count = read_u32_stream(&mut reader)? as usize;
2664        validate_embedding_dimension(dimension)?;
2665        if entry_count > MAX_ENTRIES {
2666            return Err(format!("too many semantic index entries: {}", entry_count));
2667        }
2668
2669        // Fingerprint handling:
2670        //   - V1: no fingerprint field at all.
2671        //   - V2: fingerprint_len + fingerprint bytes; always present (writer
2672        //     only emitted V2 when fingerprint was Some).
2673        //   - V3+: fingerprint_len always present; fingerprint_len==0 ⇒ None.
2674        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2675            || version == SEMANTIC_INDEX_VERSION_V3
2676            || version == SEMANTIC_INDEX_VERSION_V4
2677            || version == SEMANTIC_INDEX_VERSION_V5
2678            || version == SEMANTIC_INDEX_VERSION_V6
2679            || version == SEMANTIC_INDEX_VERSION_V7;
2680        let fingerprint = if has_fingerprint_field {
2681            let fingerprint_len = read_u32_stream(&mut reader)? as usize;
2682            if total_len
2683                .is_some_and(|len| reader.bytes_read().saturating_add(fingerprint_len) > len)
2684            {
2685                return Err("unexpected end of data reading fingerprint".to_string());
2686            }
2687            if fingerprint_len == 0 {
2688                None
2689            } else {
2690                let mut raw = vec![0u8; fingerprint_len];
2691                read_exact_stream(
2692                    &mut reader,
2693                    &mut raw,
2694                    "unexpected end of data reading fingerprint",
2695                )?;
2696                let raw = String::from_utf8_lossy(&raw).to_string();
2697                Some(
2698                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2699                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2700                )
2701            }
2702        } else {
2703            None
2704        };
2705
2706        // File mtimes
2707        let mtime_count = read_u32_stream(&mut reader)? as usize;
2708        if mtime_count > MAX_ENTRIES {
2709            return Err(format!("too many semantic file mtimes: {}", mtime_count));
2710        }
2711
2712        let vector_bytes = entry_count
2713            .checked_mul(dimension)
2714            .and_then(|count| count.checked_mul(F32_BYTES))
2715            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2716        if total_len.is_some_and(|len| vector_bytes > len.saturating_sub(reader.bytes_read())) {
2717            return Err("semantic index vectors exceed available data".to_string());
2718        }
2719
2720        let mut file_mtimes = HashMap::with_capacity(mtime_count);
2721        let mut file_sizes = HashMap::with_capacity(mtime_count);
2722        let mut file_hashes = HashMap::with_capacity(mtime_count);
2723        for _ in 0..mtime_count {
2724            let path = read_string_stream(&mut reader, total_len)?;
2725            let secs = read_u64_stream(&mut reader)?;
2726            // V3+ persists subsec_nanos alongside secs so staleness checks
2727            // survive restart round-trips. V1/V2 load with 0 nanos, which
2728            // causes one rebuild on upgrade (they never matched live APFS
2729            // mtimes anyway — the bug v0.15.2 fixes). After that rebuild,
2730            // the cache is persisted as V3 and stabilises.
2731            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2732                || version == SEMANTIC_INDEX_VERSION_V4
2733                || version == SEMANTIC_INDEX_VERSION_V5
2734                || version == SEMANTIC_INDEX_VERSION_V6
2735                || version == SEMANTIC_INDEX_VERSION_V7
2736            {
2737                read_u32_stream(&mut reader)?
2738            } else {
2739                0
2740            };
2741            let size = if version == SEMANTIC_INDEX_VERSION_V5
2742                || version == SEMANTIC_INDEX_VERSION_V6
2743                || version == SEMANTIC_INDEX_VERSION_V7
2744            {
2745                read_u64_stream(&mut reader)?
2746            } else {
2747                0
2748            };
2749            let content_hash =
2750                if version == SEMANTIC_INDEX_VERSION_V6 || version == SEMANTIC_INDEX_VERSION_V7 {
2751                    let mut hash_bytes = [0u8; 32];
2752                    read_exact_stream(
2753                        &mut reader,
2754                        &mut hash_bytes,
2755                        "unexpected end of data reading content hash",
2756                    )?;
2757                    blake3::Hash::from_bytes(hash_bytes)
2758                } else {
2759                    cache_freshness::zero_hash()
2760                };
2761            // Hardening against corrupt / maliciously crafted cache files
2762            // (v0.15.2). `Duration::new(secs, nanos)` can panic when the
2763            // nanosecond carry overflows the second counter, and
2764            // `SystemTime + Duration` can panic on carry past the platform's
2765            // upper bound. Explicit validation keeps a corrupted semantic.bin
2766            // from taking down the whole aft process.
2767            if nanos >= 1_000_000_000 {
2768                return Err(format!(
2769                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
2770                    nanos
2771                ));
2772            }
2773            let duration = std::time::Duration::new(secs, nanos);
2774            let mtime = SystemTime::UNIX_EPOCH
2775                .checked_add(duration)
2776                .ok_or_else(|| {
2777                    format!(
2778                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2779                        secs, nanos
2780                    )
2781                })?;
2782            let path = if version == SEMANTIC_INDEX_VERSION_V6
2783                || version == SEMANTIC_INDEX_VERSION_V7
2784            {
2785                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2786                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2787            } else {
2788                PathBuf::from(path)
2789            };
2790            file_mtimes.insert(path.clone(), mtime);
2791            file_sizes.insert(path.clone(), size);
2792            file_hashes.insert(path, content_hash);
2793        }
2794
2795        // Entries
2796        let mut entries = Vec::with_capacity(entry_count);
2797        for _ in 0..entry_count {
2798            let raw_file = PathBuf::from(read_string_stream(&mut reader, total_len)?);
2799            let file = if version == SEMANTIC_INDEX_VERSION_V6
2800                || version == SEMANTIC_INDEX_VERSION_V7
2801            {
2802                cached_path_under_root(current_canonical_root, &raw_file)
2803                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2804            } else {
2805                raw_file
2806            };
2807            let name = read_string_stream(&mut reader, total_len)?;
2808            let qualified_name = if version == SEMANTIC_INDEX_VERSION_V7 {
2809                let qualified_name = read_string_stream(&mut reader, total_len)?;
2810                if qualified_name.is_empty() {
2811                    None
2812                } else {
2813                    Some(qualified_name)
2814                }
2815            } else {
2816                None
2817            };
2818
2819            let kind = u8_to_symbol_kind(read_u8_stream(&mut reader, "unexpected end of data")?);
2820
2821            let start_line = read_u32_stream(&mut reader)?;
2822            let end_line = read_u32_stream(&mut reader)?;
2823
2824            let exported = read_u8_stream(&mut reader, "unexpected end of data")? != 0;
2825
2826            let snippet = read_string_stream(&mut reader, total_len)?;
2827            let embed_text = read_string_stream(&mut reader, total_len)?;
2828
2829            // Vector
2830            let vec_bytes = dimension
2831                .checked_mul(F32_BYTES)
2832                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2833            if total_len.is_some_and(|len| reader.bytes_read().saturating_add(vec_bytes) > len) {
2834                return Err("unexpected end of data reading vector".to_string());
2835            }
2836            let mut vector = Vec::with_capacity(dimension);
2837            for _ in 0..dimension {
2838                let mut bytes = [0u8; F32_BYTES];
2839                read_exact_stream(
2840                    &mut reader,
2841                    &mut bytes,
2842                    "unexpected end of data reading vector",
2843                )?;
2844                vector.push(f32::from_le_bytes(bytes));
2845            }
2846
2847            entries.push(EmbeddingEntry {
2848                chunk: SemanticChunk {
2849                    file,
2850                    name,
2851                    qualified_name,
2852                    kind,
2853                    start_line,
2854                    end_line,
2855                    exported,
2856                    embed_text,
2857                    snippet,
2858                },
2859                vector,
2860            });
2861        }
2862
2863        if entries.len() != entry_count {
2864            return Err(format!(
2865                "semantic cache entry count drift: header={} decoded={}",
2866                entry_count,
2867                entries.len()
2868            ));
2869        }
2870        for entry in &entries {
2871            if !file_mtimes.contains_key(&entry.chunk.file) {
2872                return Err(format!(
2873                    "semantic cache metadata missing for entry file {}",
2874                    entry.chunk.file.display()
2875                ));
2876            }
2877        }
2878
2879        Ok(Self {
2880            entries,
2881            file_mtimes,
2882            file_sizes,
2883            file_hashes,
2884            dimension,
2885            fingerprint,
2886            project_root: current_canonical_root.to_path_buf(),
2887            deferred_files: HashSet::new(),
2888        })
2889    }
2890}
2891
/// Writes all of `bytes` to `writer` and, only if that succeeds, adds their
/// length to `bytes_written` (saturating at `usize::MAX`).
fn write_counted<W: Write>(
    writer: &mut W,
    bytes: &[u8],
    bytes_written: &mut usize,
) -> io::Result<()> {
    writer.write_all(bytes).map(|()| {
        *bytes_written = bytes_written.saturating_add(bytes.len());
    })
}
2901
/// A [`Read`] adapter that keeps a running total of the bytes that have been
/// read through it.
struct CountingReader<R> {
    inner: R,
    bytes_read: usize,
}

impl<R> CountingReader<R> {
    /// Wraps `inner`, starting the running total at `bytes_read` so bytes
    /// consumed before wrapping can be accounted for.
    fn with_bytes_read(inner: R, bytes_read: usize) -> Self {
        CountingReader { inner, bytes_read }
    }

    /// Returns the running total: the initial count plus every byte read so
    /// far.
    fn bytes_read(&self) -> usize {
        self.bytes_read
    }
}

impl<R: Read> Read for CountingReader<R> {
    /// Forwards to the wrapped reader and adds the number of bytes it
    /// returned to the running total (saturating); errors leave the total
    /// untouched.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let outcome = self.inner.read(buf);
        if let Ok(count) = &outcome {
            self.bytes_read = self.bytes_read.saturating_add(*count);
        }
        outcome
    }
}
2924
2925fn read_exact_stream<R: Read>(
2926    reader: &mut CountingReader<R>,
2927    buf: &mut [u8],
2928    eof_message: &'static str,
2929) -> Result<(), String> {
2930    reader.read_exact(buf).map_err(|error| {
2931        if error.kind() == io::ErrorKind::UnexpectedEof {
2932            eof_message.to_string()
2933        } else {
2934            format!("{eof_message}: {error}")
2935        }
2936    })
2937}
2938
2939fn read_u8_stream<R: Read>(
2940    reader: &mut CountingReader<R>,
2941    eof_message: &'static str,
2942) -> Result<u8, String> {
2943    let mut bytes = [0u8; 1];
2944    read_exact_stream(reader, &mut bytes, eof_message)?;
2945    Ok(bytes[0])
2946}
2947
2948fn read_u32_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u32, String> {
2949    let mut bytes = [0u8; 4];
2950    read_exact_stream(reader, &mut bytes, "unexpected end of data reading u32")?;
2951    Ok(u32::from_le_bytes(bytes))
2952}
2953
2954fn read_u64_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u64, String> {
2955    let mut bytes = [0u8; 8];
2956    read_exact_stream(reader, &mut bytes, "unexpected end of data reading u64")?;
2957    Ok(u64::from_le_bytes(bytes))
2958}
2959
2960fn read_string_stream<R: Read>(
2961    reader: &mut CountingReader<R>,
2962    total_len: Option<usize>,
2963) -> Result<String, String> {
2964    let len = read_u32_stream(reader)? as usize;
2965    if total_len.is_some_and(|total_len| reader.bytes_read().saturating_add(len) > total_len) {
2966        return Err("unexpected end of data reading string".to_string());
2967    }
2968    let mut bytes = vec![0u8; len];
2969    read_exact_stream(reader, &mut bytes, "unexpected end of data reading string")?;
2970    Ok(String::from_utf8_lossy(&bytes).to_string())
2971}
2972
/// The lines of a source text together with the byte offset at which each
/// line starts.
struct SourceLineCache<'a> {
    lines: Vec<&'a str>,
    line_starts: Vec<usize>,
}

impl<'a> SourceLineCache<'a> {
    /// Splits `source` with [`str::lines`] and records, for every line, its
    /// starting byte offset in `source`.
    ///
    /// `lines()` drops the terminators, so after each line the cursor is
    /// advanced past whichever terminator (`"\r\n"` or `"\n"`) follows it in
    /// the original bytes, if any.
    fn new(source: &'a str) -> Self {
        let raw = source.as_bytes();
        let lines: Vec<&'a str> = source.lines().collect();

        let mut cursor = 0usize;
        let line_starts: Vec<usize> = lines
            .iter()
            .map(|line| {
                let start = cursor;
                cursor += line.len();
                let next = raw.get(cursor).copied();
                let after_next = raw.get(cursor + 1).copied();
                cursor += match (next, after_next) {
                    (Some(b'\r'), Some(b'\n')) => 2,
                    (Some(b'\n'), _) => 1,
                    _ => 0,
                };
                start
            })
            .collect();

        Self { lines, line_starts }
    }

    /// Number of lines held by the cache.
    fn len(&self) -> usize {
        debug_assert_eq!(self.lines.len(), self.line_starts.len());
        self.line_starts.len()
    }
}
3001
3002/// Build enriched embedding text from a symbol with cAST-style context
3003fn build_embed_text_with_lines(
3004    symbol: &Symbol,
3005    line_cache: &SourceLineCache<'_>,
3006    file: &Path,
3007    project_root: &Path,
3008) -> String {
3009    let relative = file
3010        .strip_prefix(project_root)
3011        .unwrap_or(file)
3012        .to_string_lossy();
3013
3014    let kind_label = match &symbol.kind {
3015        SymbolKind::Function => "function",
3016        SymbolKind::Class => "class",
3017        SymbolKind::Method => "method",
3018        SymbolKind::Struct => "struct",
3019        SymbolKind::Interface => "interface",
3020        SymbolKind::Enum => "enum",
3021        SymbolKind::TypeAlias => "type",
3022        SymbolKind::Variable => "variable",
3023        SymbolKind::Heading => "heading",
3024        SymbolKind::FileSummary => "file-summary",
3025    };
3026
3027    // Build: "file:relative/path kind:function name:validateAuth signature:fn validateAuth(token: &str) -> bool"
3028    let name = &symbol.name;
3029    let mut text = format!(
3030        "name:{name} file:{} kind:{} name:{name}",
3031        relative, kind_label
3032    );
3033
3034    if let Some(sig) = &symbol.signature {
3035        // Cap the signature: structured parsers (e.g. YAML/Kubernetes) pack
3036        // entire inline scripts (CronJob/Job `command:` bodies, multi-KB) into
3037        // the signature. Appending it unbounded produces a single embed_text
3038        // that overflows the embedding backend's physical batch (e.g. a
3039        // llama.cpp server's 512-token cap), aborting the whole index build
3040        // and silently degrading every search to lexical. 400 chars keeps the
3041        // identifying head of the signature without blowing the budget.
3042        text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
3043    }
3044
3045    // Add body snippet (first ~300 chars of symbol body)
3046    let start = (symbol.range.start_line as usize).min(line_cache.len());
3047    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
3048    let end = (symbol.range.end_line as usize + 1).min(line_cache.len());
3049    if start < end {
3050        let body: String = line_cache.lines[start..end]
3051            .iter()
3052            .take(15) // max 15 lines
3053            .copied()
3054            .collect::<Vec<&str>>()
3055            .join("\n");
3056        let snippet = if body.len() > 300 {
3057            format!("{}...", &body[..body.floor_char_boundary(300)])
3058        } else {
3059            body
3060        };
3061        text.push_str(&format!(" body:{}", snippet));
3062    }
3063
3064    // Final defense-in-depth clamp: no single embed_text may exceed the
3065    // backend's per-input budget regardless of which field grew. Most
3066    // backends cap a physical batch around 512 tokens; ~1600 chars stays
3067    // comfortably under that for typical English/code (≈4 chars/token).
3068    truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
3069}
3070
/// Test-only convenience wrapper: builds a line cache from `source` and
/// forwards to `build_embed_text_with_lines`.
#[cfg(test)]
fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
    build_embed_text_with_lines(symbol, &SourceLineCache::new(source), file, project_root)
}
3076
/// Upper bound, counted in `char`s, on a single chunk's `embed_text`. Keeps
/// any one input below typical embedding-backend physical batch limits
/// (~512 tokens) so an oversized symbol cannot abort the whole index build.
const MAX_EMBED_TEXT_CHARS: usize = 1600;
3081
/// Returns at most the first `max_chars` characters of `value` as an owned
/// `String`.
///
/// The cut is made on a `char` boundary, so multi-byte UTF-8 sequences are
/// never split. Inputs that already fit are returned unchanged.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte offset of the `max_chars`-th character (0-based) is exactly
    // where the kept prefix ends; if there is no such character, keep it all.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
3085
3086fn first_leading_doc_comment(line_cache: &SourceLineCache<'_>) -> String {
3087    let Some((start, first)) = line_cache
3088        .lines
3089        .iter()
3090        .enumerate()
3091        .find(|(_, line)| !line.trim().is_empty())
3092    else {
3093        return String::new();
3094    };
3095
3096    let trimmed = first.trim_start();
3097    if trimmed.starts_with("/**") {
3098        let mut comment = Vec::new();
3099        for line in line_cache.lines.iter().skip(start) {
3100            comment.push(*line);
3101            if line.contains("*/") {
3102                break;
3103            }
3104        }
3105        return truncate_chars(&comment.join("\n"), 200);
3106    }
3107
3108    if trimmed.starts_with("///") || trimmed.starts_with("//!") {
3109        let comment = line_cache
3110            .lines
3111            .iter()
3112            .skip(start)
3113            .take_while(|line| {
3114                let trimmed = line.trim_start();
3115                trimmed.starts_with("///") || trimmed.starts_with("//!")
3116            })
3117            .copied()
3118            .collect::<Vec<_>>()
3119            .join("\n");
3120        return truncate_chars(&comment, 200);
3121    }
3122
3123    String::new()
3124}
3125
3126pub fn build_file_summary_chunk(
3127    file: &Path,
3128    project_root: &Path,
3129    source: &str,
3130    top_exports: &[&str],
3131    top_export_signatures: &[Option<&str>],
3132) -> SemanticChunk {
3133    let line_cache = SourceLineCache::new(source);
3134    build_file_summary_chunk_with_lines(
3135        file,
3136        project_root,
3137        &line_cache,
3138        top_exports,
3139        top_export_signatures,
3140    )
3141}
3142
3143fn build_file_summary_chunk_with_lines(
3144    file: &Path,
3145    project_root: &Path,
3146    line_cache: &SourceLineCache<'_>,
3147    top_exports: &[&str],
3148    top_export_signatures: &[Option<&str>],
3149) -> SemanticChunk {
3150    let relative = file.strip_prefix(project_root).unwrap_or(file);
3151    let rel_path = relative.to_string_lossy();
3152    let parent_dir = relative
3153        .parent()
3154        .map(|parent| parent.to_string_lossy().to_string())
3155        .unwrap_or_default();
3156    let name = file
3157        .file_stem()
3158        .map(|stem| stem.to_string_lossy().to_string())
3159        .unwrap_or_default();
3160    let doc = first_leading_doc_comment(line_cache);
3161    let exports = top_exports
3162        .iter()
3163        .take(5)
3164        .copied()
3165        .collect::<Vec<_>>()
3166        .join(",");
3167    let snippet = if doc.is_empty() {
3168        top_export_signatures
3169            .first()
3170            .and_then(|signature| signature.as_deref())
3171            .map(|signature| truncate_chars(signature, 200))
3172            .unwrap_or_default()
3173    } else {
3174        doc.clone()
3175    };
3176
3177    SemanticChunk {
3178        file: file.to_path_buf(),
3179        name,
3180        qualified_name: None,
3181        kind: SymbolKind::FileSummary,
3182        start_line: 0,
3183        end_line: 0,
3184        exported: false,
3185        embed_text: truncate_chars(
3186            &format!(
3187                "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
3188                file.file_stem()
3189                    .map(|stem| stem.to_string_lossy().to_string())
3190                    .unwrap_or_default()
3191            ),
3192            MAX_EMBED_TEXT_CHARS,
3193        ),
3194        snippet,
3195    }
3196}
3197
3198fn parser_for(
3199    parsers: &mut HashMap<crate::parser::LangId, Parser>,
3200    lang: crate::parser::LangId,
3201) -> Result<&mut Parser, String> {
3202    use std::collections::hash_map::Entry;
3203
3204    match parsers.entry(lang) {
3205        Entry::Occupied(entry) => Ok(entry.into_mut()),
3206        Entry::Vacant(entry) => {
3207            let grammar = grammar_for(lang);
3208            let mut parser = Parser::new();
3209            parser
3210                .set_language(&grammar)
3211                .map_err(|error| error.to_string())?;
3212            Ok(entry.insert(parser))
3213        }
3214    }
3215}
3216
/// Reports whether `path` has a file extension that the semantic index
/// accepts.
///
/// The comparison is exact and case-sensitive (`"r"` and `"R"` are both
/// listed explicitly). Paths without an extension, or with a non-UTF-8
/// extension, are rejected.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
        "pas", "pp", "dpr", "dpk", "lpr", "java", "kt", "kts", "rb", "swift", "scala", "sc",
        "lua", "pl", "pm", "t", "r", "R",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
3267
/// Canonicalizes `path`, tolerating a final component that no longer exists.
///
/// - If `path` exists, this is plain `fs::canonicalize`.
/// - Otherwise the parent directory is canonicalized and the original file
///   name is re-attached, so a just-deleted file still maps to the same
///   canonical path it had while it existed.
/// - If that also fails (no parent, no file name, or the parent is missing
///   too), `path` is returned unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    if let Ok(canonical) = fs::canonicalize(path) {
        return canonical;
    }

    let (Some(parent), Some(file_name)) = (path.parent(), path.file_name()) else {
        return path.to_path_buf();
    };

    // A bare relative name such as `foo.rs` has the parent `""`, which
    // `fs::canonicalize` rejects. Resolve it against the current directory so
    // deleted files are handled the same way existing relative files are.
    let parent = if parent.as_os_str().is_empty() {
        Path::new(".")
    } else {
        parent
    };

    fs::canonicalize(parent)
        .map(|canonical_parent| canonical_parent.join(file_name))
        .unwrap_or_else(|_| path.to_path_buf())
}
3284
/// Files larger than this are skipped for semantic chunking. The read +
/// tree-sitter parse is transiently O(file size) (tree-sitter can use several×
/// the source bytes), and `par_iter` collection parses many files at once, so an
/// unbounded read here is an OOM vector on a repo with a few multi-MB generated/
/// vendored/minified files. A file this large yields almost no useful embedding
/// anyway (each chunk's embed_text is clamped to MAX_EMBED_TEXT_CHARS), so we
/// track it (0 chunks) instead of reading it — freshness then skips it on later
/// refreshes. 4 MiB keeps essentially all hand-written source while capping the
/// pathological tail.
///
/// The comparison in `collect_semantic_file` is strict (`size > limit`), so a
/// file of exactly this many bytes is still read and parsed.
const MAX_SEMANTIC_FILE_BYTES: u64 = 4 * 1024 * 1024;
3295
3296fn collect_semantic_file(
3297    project_root: &Path,
3298    file: &Path,
3299    parsers: &mut HashMap<crate::parser::LangId, Parser>,
3300) -> Result<(IndexedFileMetadata, Vec<SemanticChunk>), String> {
3301    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
3302    if !metadata.is_file() {
3303        return Err("not a regular file".to_string());
3304    }
3305    let mtime = metadata.modified().map_err(|error| error.to_string())?;
3306    let size = metadata.len();
3307
3308    if !is_semantic_indexed_extension(file) {
3309        return Err("unsupported file extension".to_string());
3310    }
3311    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
3312
3313    let mut indexed_metadata = IndexedFileMetadata {
3314        mtime,
3315        size,
3316        content_hash: cache_freshness::zero_hash(),
3317    };
3318
3319    // OOM backstop: skip oversized files before the read + parse (tracked with
3320    // zero chunks by the caller, so freshness won't re-read them every refresh).
3321    if size > MAX_SEMANTIC_FILE_BYTES {
3322        return Ok((indexed_metadata, Vec::new()));
3323    }
3324
3325    let source = fs::read_to_string(file).map_err(|error| error.to_string())?;
3326    indexed_metadata.content_hash = if size <= cache_freshness::CONTENT_HASH_SIZE_CAP {
3327        cache_freshness::hash_bytes(source.as_bytes())
3328    } else {
3329        cache_freshness::zero_hash()
3330    };
3331
3332    let chunks = collect_file_chunks_from_source(project_root, file, lang, parsers, &source)?;
3333    Ok((indexed_metadata, chunks))
3334}
3335
/// Test-only variant of `collect_semantic_file` that returns just the chunks.
///
/// Applies the same extension / language gate and the same
/// `MAX_SEMANTIC_FILE_BYTES` backstop (oversized files yield no chunks), then
/// reads and chunks the file. A failed `fs::metadata` call does not count as
/// oversized; the subsequent read reports the error instead.
#[cfg(test)]
fn collect_file_chunks(
    project_root: &Path,
    file: &Path,
    parsers: &mut HashMap<crate::parser::LangId, Parser>,
) -> Result<Vec<SemanticChunk>, String> {
    let lang = is_semantic_indexed_extension(file)
        .then(|| detect_language(file))
        .flatten()
        .ok_or_else(|| "unsupported file extension".to_string())?;

    // OOM backstop: skip oversized files before the read + parse.
    let oversized = matches!(fs::metadata(file), Ok(stat) if stat.len() > MAX_SEMANTIC_FILE_BYTES);
    if oversized {
        return Ok(Vec::new());
    }

    let source = fs::read_to_string(file).map_err(|error| error.to_string())?;
    collect_file_chunks_from_source(project_root, file, lang, parsers, &source)
}
3354
3355fn collect_file_chunks_from_source(
3356    project_root: &Path,
3357    file: &Path,
3358    lang: crate::parser::LangId,
3359    parsers: &mut HashMap<crate::parser::LangId, Parser>,
3360    source: &str,
3361) -> Result<Vec<SemanticChunk>, String> {
3362    let tree = parser_for(parsers, lang)?
3363        .parse(source, None)
3364        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
3365    let symbols =
3366        extract_symbols_from_tree(source, &tree, lang).map_err(|error| error.to_string())?;
3367
3368    Ok(symbols_to_chunks(file, &symbols, source, project_root))
3369}
3370
3371/// Build a display snippet from a symbol's source
3372fn build_snippet_with_lines(symbol: &Symbol, line_cache: &SourceLineCache<'_>) -> String {
3373    let start = (symbol.range.start_line as usize).min(line_cache.len());
3374    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
3375    let end = (symbol.range.end_line as usize + 1).min(line_cache.len());
3376    if start < end {
3377        let snippet_lines: Vec<&str> = line_cache.lines[start..end]
3378            .iter()
3379            .take(5)
3380            .copied()
3381            .collect();
3382        let mut snippet = snippet_lines.join("\n");
3383        if end - start > 5 {
3384            snippet.push_str("\n  ...");
3385        }
3386        if snippet.len() > 300 {
3387            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
3388        }
3389        snippet
3390    } else {
3391        String::new()
3392    }
3393}
3394
/// Test-only shortcut: builds the display snippet for `symbol` from raw
/// `source` by splitting it into a temporary `SourceLineCache`.
#[cfg(test)]
fn build_snippet(symbol: &Symbol, source: &str) -> String {
    build_snippet_with_lines(symbol, &SourceLineCache::new(source))
}
3400
3401fn qualified_name_for_symbol(symbol: &Symbol) -> Option<String> {
3402    let mut parts = symbol
3403        .scope_chain
3404        .iter()
3405        .filter(|part| !part.is_empty())
3406        .cloned()
3407        .collect::<Vec<_>>();
3408    if !symbol.name.is_empty() {
3409        parts.push(symbol.name.clone());
3410    }
3411    (!parts.is_empty()).then(|| parts.join("."))
3412}
3413
3414/// Convert symbols to semantic chunks with enriched context
3415fn symbols_to_chunks(
3416    file: &Path,
3417    symbols: &[Symbol],
3418    source: &str,
3419    project_root: &Path,
3420) -> Vec<SemanticChunk> {
3421    let line_cache = SourceLineCache::new(source);
3422    let mut chunks = Vec::new();
3423    let top_exports_with_signatures = symbols
3424        .iter()
3425        .filter(|symbol| {
3426            symbol.exported
3427                && symbol.parent.is_none()
3428                && !matches!(symbol.kind, SymbolKind::Heading)
3429        })
3430        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
3431        .collect::<Vec<_>>();
3432
3433    let has_only_headings = !symbols.is_empty()
3434        && symbols
3435            .iter()
3436            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
3437    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
3438        let top_exports = top_exports_with_signatures
3439            .iter()
3440            .map(|(name, _)| *name)
3441            .collect::<Vec<_>>();
3442        let top_export_signatures = top_exports_with_signatures
3443            .iter()
3444            .map(|(_, signature)| *signature)
3445            .collect::<Vec<_>>();
3446        chunks.push(build_file_summary_chunk_with_lines(
3447            file,
3448            project_root,
3449            &line_cache,
3450            &top_exports,
3451            &top_export_signatures,
3452        ));
3453    }
3454
3455    for symbol in symbols {
3456        // Skip Markdown / HTML heading chunks: empirically they dominate result
3457        // lists even for code-shaped queries because heading prose embeds well.
3458        // Agents querying for code lose the actual matches under doc noise.
3459        // README/docs queries are still served by grep on the same files.
3460        if matches!(symbol.kind, SymbolKind::Heading) {
3461            continue;
3462        }
3463
3464        // Skip very small symbols (single-line variables, etc.)
3465        let line_count = symbol
3466            .range
3467            .end_line
3468            .saturating_sub(symbol.range.start_line)
3469            + 1;
3470        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
3471            continue;
3472        }
3473
3474        let embed_text = build_embed_text_with_lines(symbol, &line_cache, file, project_root);
3475        let snippet = build_snippet_with_lines(symbol, &line_cache);
3476
3477        chunks.push(SemanticChunk {
3478            file: file.to_path_buf(),
3479            name: symbol.name.clone(),
3480            qualified_name: qualified_name_for_symbol(symbol),
3481            kind: symbol.kind.clone(),
3482            start_line: symbol.range.start_line,
3483            end_line: symbol.range.end_line,
3484            exported: symbol.exported,
3485            embed_text,
3486            snippet,
3487        });
3488
3489        // Note: Nested symbols are handled separately by the outline system
3490        // Each symbol is indexed individually
3491    }
3492
3493    chunks
3494}
3495
/// Comparator for `(score, index)` pairs: highest score first, ties broken by
/// ascending index.
///
/// NaN scores are ordered after every real score (and equal to each other
/// before the index tie-break). Mapping an incomparable pair to `Equal`
/// instead would make the comparator non-transitive, which `sort_by` is
/// allowed to answer with a panic or an unspecified order, and could also
/// rank a NaN-scored entry first.
fn semantic_score_order(a: &(f32, usize), b: &(f32, usize)) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    let by_score = match (a.0.is_nan(), b.0.is_nan()) {
        // Descending by score; `partial_cmp` cannot fail once NaN is excluded.
        (false, false) => b.0.partial_cmp(&a.0).unwrap_or(Ordering::Equal),
        (true, true) => Ordering::Equal,
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
    };
    by_score.then_with(|| a.1.cmp(&b.1))
}
3501
/// Cosine similarity of `a` and `b`.
///
/// Returns `0.0` when the lengths differ or when either vector has zero
/// magnitude (which includes empty input).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Single pass accumulating the dot product and both squared norms.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
3525
// Serialization helpers

/// Maps a `SymbolKind` to the single-byte tag used when a chunk's kind is
/// written into a serialized buffer.
///
/// The numbering mirrors `u8_to_symbol_kind`; the two tables must be kept in
/// sync. The match is exhaustive, so adding a `SymbolKind` variant forces a
/// new tag to be chosen here.
fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
    match kind {
        SymbolKind::Function => 0,
        SymbolKind::Class => 1,
        SymbolKind::Method => 2,
        SymbolKind::Struct => 3,
        SymbolKind::Interface => 4,
        SymbolKind::Enum => 5,
        SymbolKind::TypeAlias => 6,
        SymbolKind::Variable => 7,
        SymbolKind::Heading => 8,
        SymbolKind::FileSummary => 9,
    }
}
3541
3542fn u8_to_symbol_kind(v: u8) -> SymbolKind {
3543    match v {
3544        0 => SymbolKind::Function,
3545        1 => SymbolKind::Class,
3546        2 => SymbolKind::Method,
3547        3 => SymbolKind::Struct,
3548        4 => SymbolKind::Interface,
3549        5 => SymbolKind::Enum,
3550        6 => SymbolKind::TypeAlias,
3551        7 => SymbolKind::Variable,
3552        8 => SymbolKind::Heading,
3553        9 => SymbolKind::FileSummary,
3554        _ => SymbolKind::Heading,
3555    }
3556}
3557
3558#[cfg(test)]
3559mod tests {
3560    use super::*;
3561    use crate::config::{SemanticBackend, SemanticBackendConfig};
3562    use crate::parser::FileParser;
3563    use std::io::{Read, Write};
3564    use std::net::TcpListener;
3565    use std::thread;
3566
3567    #[test]
3568    fn semantic_index_includes_php_inc_and_scss_extensions() {
3569        for file in ["partial.inc", "index.php", "styles.scss"] {
3570            assert!(
3571                is_semantic_indexed_extension(Path::new(file)),
3572                "{file} should be semantic-index eligible"
3573            );
3574        }
3575    }
3576
3577    #[test]
3578    fn transient_marker_round_trips_and_classifies() {
3579        // A marked transient error is recognized and the marker is stripped for
3580        // display, leaving a clean message.
3581        let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
3582        assert!(embedding_failure_is_transient(&marked));
3583        let clean = strip_transient_embedding_marker(&marked);
3584        assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
3585        assert!(clean.starts_with("openai compatible request failed:"));
3586
3587        // Permanent errors (HTTP 4xx, dimension mismatch) carry no marker and
3588        // are not classified transient — they must fail fast.
3589        for permanent in [
3590            "openai compatible request failed (HTTP 401): Unauthorized",
3591            "embedding dimension mismatch: index has 384, model returned 768",
3592            "too many files (>20000) for semantic indexing (max 20000)",
3593        ] {
3594            assert!(
3595                !embedding_failure_is_transient(permanent),
3596                "{permanent:?} must not be transient"
3597            );
3598            // Stripping a marker-free string is a no-op.
3599            assert_eq!(strip_transient_embedding_marker(permanent), permanent);
3600        }
3601    }
3602
3603    #[test]
3604    fn send_error_transience_separates_connect_timeout_from_4xx() {
3605        // 5xx / 429 are transient; other client errors are not.
3606        assert!(is_retryable_embedding_status(
3607            reqwest::StatusCode::INTERNAL_SERVER_ERROR
3608        ));
3609        assert!(is_retryable_embedding_status(
3610            reqwest::StatusCode::TOO_MANY_REQUESTS
3611        ));
3612        assert!(!is_retryable_embedding_status(
3613            reqwest::StatusCode::UNAUTHORIZED
3614        ));
3615        assert!(!is_retryable_embedding_status(
3616            reqwest::StatusCode::BAD_REQUEST
3617        ));
3618    }
3619
3620    #[test]
3621    fn local_backend_model_loading_body_is_transient() {
3622        // LM Studio / Ollama return a 4xx with a loading/unloaded message while
3623        // the model swaps; these must classify transient so the build self-heals.
3624        for body in [
3625            r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
3626            r#"{"error":"model is loading, please wait"}"#,
3627            r#"{"error":"Model not loaded"}"#,
3628            "Loading model into memory",
3629        ] {
3630            assert!(
3631                embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3632                "{body:?} should be body-transient"
3633            );
3634        }
3635
3636        // A genuine 4xx misconfiguration body must NOT be treated as transient,
3637        // even when it happens to contain generic words from the old broad
3638        // substring matcher.
3639        for body in [
3640            r#"{"error":"invalid api key"}"#,
3641            r#"{"error":"model 'foo' not found"}"#,
3642            "Bad Request: unknown field",
3643            "Bad Request: invalid loading model option",
3644            r#"{"error":"unauthorized while model is being loaded by another account"}"#,
3645        ] {
3646            assert!(
3647                !embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3648                "{body:?} must not be body-transient"
3649            );
3650        }
3651
3652        assert!(
3653            !embedding_response_body_is_transient(
3654                reqwest::StatusCode::UNAUTHORIZED,
3655                r#"{"error":"model is loading, please wait"}"#
3656            ),
3657            "permanent auth failures must not become transient because of body text"
3658        );
3659    }
3660
3661    fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
3662    where
3663        F: Fn(String, String, String) -> String + Send + 'static,
3664    {
3665        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3666        let addr = listener.local_addr().expect("local addr");
3667        let handle = thread::spawn(move || {
3668            let (mut stream, _) = listener.accept().expect("accept request");
3669            let mut buf = Vec::new();
3670            let mut chunk = [0u8; 4096];
3671            let mut header_end = None;
3672            let mut content_length = 0usize;
3673            loop {
3674                let n = stream.read(&mut chunk).expect("read request");
3675                if n == 0 {
3676                    break;
3677                }
3678                buf.extend_from_slice(&chunk[..n]);
3679                if header_end.is_none() {
3680                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3681                        header_end = Some(pos + 4);
3682                        let headers = String::from_utf8_lossy(&buf[..pos + 4]);
3683                        for line in headers.lines() {
3684                            if let Some(value) = line.strip_prefix("Content-Length:") {
3685                                content_length = value.trim().parse::<usize>().unwrap_or(0);
3686                            }
3687                        }
3688                    }
3689                }
3690                if let Some(end) = header_end {
3691                    if buf.len() >= end + content_length {
3692                        break;
3693                    }
3694                }
3695            }
3696
3697            let end = header_end.expect("header terminator");
3698            let request = String::from_utf8_lossy(&buf[..end]).to_string();
3699            let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3700            let mut lines = request.lines();
3701            let request_line = lines.next().expect("request line").to_string();
3702            let path = request_line
3703                .split_whitespace()
3704                .nth(1)
3705                .expect("request path")
3706                .to_string();
3707            let response_body = handler(request_line, path, body);
3708            let response = format!(
3709                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3710                response_body.len(),
3711                response_body
3712            );
3713            stream
3714                .write_all(response.as_bytes())
3715                .expect("write response");
3716        });
3717
3718        (format!("http://{}", addr), handle)
3719    }
3720
    /// Spawns a mock server whose responses always have a truncated body.
    ///
    /// Each accepted connection receives a `200 OK` header block that
    /// advertises `Content-Length: 128` but is followed by only a single `{`
    /// byte. The connection is closed when `stream` goes out of scope, so a
    /// client can never read the full body.
    ///
    /// The listener is non-blocking and polled every 10 ms. The server thread
    /// exits after `attempts` connections have been accepted or after a
    /// two-second deadline, whichever comes first, so a test that sends fewer
    /// requests than expected does not hang on `join`.
    ///
    /// Returns the base URL (`http://<addr>`) and the server thread's handle.
    fn start_truncated_body_server(attempts: usize) -> (String, thread::JoinHandle<()>) {
        let listener = TcpListener::bind("127.0.0.1:0").expect("bind truncated test server");
        listener
            .set_nonblocking(true)
            .expect("nonblocking listener");
        let addr = listener.local_addr().expect("local addr");
        let handle = thread::spawn(move || {
            // Hard stop so the thread terminates even if fewer than `attempts`
            // connections ever arrive.
            let deadline = std::time::Instant::now() + Duration::from_secs(2);
            let mut accepted = 0usize;
            while accepted < attempts && std::time::Instant::now() < deadline {
                match listener.accept() {
                    Ok((mut stream, _)) => {
                        accepted += 1;
                        let mut buf = [0u8; 4096];
                        // The client (under test) uses a 250ms timeout and drops
                        // the connection when the truncated body never completes.
                        // On Windows that disconnect surfaces as a hard socket
                        // error (WSAECONNRESET) on these read/write calls, where
                        // Unix returns a clean EOF. Tolerate both: the mock does
                        // not need the request bytes, and a write to an
                        // already-hung-up client is expected.
                        let _ = stream.read(&mut buf);
                        // The literal below contains raw newlines on purpose;
                        // the body is one byte against an advertised 128.
                        let response = "HTTP/1.1 200 OK
Content-Type: application/json
Content-Length: 128
Connection: close

{";
                        let _ = stream.write_all(response.as_bytes());
                    }
                    // No pending connection yet: back off briefly and poll again.
                    Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
                        thread::sleep(Duration::from_millis(10));
                    }
                    Err(error) => panic!("accept request: {error}"),
                }
            }
        });

        (format!("http://{}", addr), handle)
    }
3761
3762    #[test]
3763    fn response_body_read_failures_are_marked_transient() {
3764        let (url, handle) = start_truncated_body_server(EMBEDDING_REQUEST_MAX_ATTEMPTS);
3765        let client = Client::builder()
3766            .timeout(Duration::from_millis(250))
3767            .build()
3768            .expect("client");
3769
3770        let error = send_embedding_request(|| client.post(&url).body("{}"), "test backend")
3771            .expect_err("truncated body should fail");
3772
3773        handle.join().unwrap();
3774        assert!(
3775            embedding_failure_is_transient(&error),
3776            "body read failures should be transient-marked: {error}"
3777        );
3778        assert!(error.contains("response read failed"));
3779    }
3780
3781    fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3782        Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
3783    }
3784
3785    fn write_rust_file(path: &Path, function_name: &str) {
3786        fs::write(
3787            path,
3788            format!("pub fn {function_name}() -> bool {{\n    true\n}}\n"),
3789        )
3790        .unwrap();
3791    }
3792
3793    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3794        let mut embed = test_vector_for_texts;
3795        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3796    }
3797
3798    fn test_project_root() -> PathBuf {
3799        std::env::current_dir().unwrap()
3800    }
3801
3802    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3803        index.file_mtimes.insert(file.to_path_buf(), mtime);
3804        index.file_sizes.insert(file.to_path_buf(), size);
3805        index
3806            .file_hashes
3807            .insert(file.to_path_buf(), cache_freshness::zero_hash());
3808    }
3809
    /// Hand-serializes `index` into a byte buffer for tests.
    ///
    /// Layout, all integers little-endian:
    /// 1. header: version byte, dimension (`u32`), entry count (`u32`),
    ///    fingerprint length (`u32`) followed by the fingerprint bytes
    ///    (length 0 when there is no fingerprint or it encodes to an empty
    ///    string);
    /// 2. file table: count (`u32`), then per file the relative path
    ///    (`u32` length + bytes), mtime whole seconds, mtime sub-second
    ///    nanoseconds, size, and the content hash bytes;
    /// 3. entries: per entry the relative file path, name, kind tag (`u8`),
    ///    start line (`u32`), end line (`u32`), exported flag (`u8`), snippet,
    ///    embed text (each string as `u32` length + bytes), then every vector
    ///    component's little-endian bytes.
    ///
    /// Files and entries for which `cache_relative_path` returns `None` are
    /// left out. `qualified_name` is not written by this function.
    ///
    /// NOTE(review): despite the `legacy_` name, the version byte written is
    /// `SEMANTIC_INDEX_VERSION_V6`; presumably a newer on-disk format exists
    /// outside this view — confirm against the reader.
    fn legacy_semantic_index_bytes(index: &SemanticIndex) -> Vec<u8> {
        let mut buf = Vec::new();
        // An empty encoded fingerprint is treated the same as no fingerprint.
        let fingerprint_bytes = index.fingerprint.as_ref().and_then(|fingerprint| {
            let encoded = fingerprint.as_string();
            if encoded.is_empty() {
                None
            } else {
                Some(encoded.into_bytes())
            }
        });
        // Keep the original path next to the relative one: the relative path is
        // what gets written, the original is the key into the size/hash maps.
        let file_mtimes: Vec<_> = index
            .file_mtimes
            .iter()
            .filter_map(|(path, mtime)| {
                cache_relative_path(&index.project_root, path)
                    .map(|relative| (relative, path, mtime))
            })
            .collect();
        let entries: Vec<_> = index
            .entries
            .iter()
            .filter_map(|entry| {
                cache_relative_path(&index.project_root, &entry.chunk.file)
                    .map(|relative| (relative, entry))
            })
            .collect();

        // --- Header ---
        buf.push(SEMANTIC_INDEX_VERSION_V6);
        buf.extend_from_slice(&(index.dimension as u32).to_le_bytes());
        // Counts reflect the filtered lists, not the raw index contents.
        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
        buf.extend_from_slice(fp_bytes_ref);

        // --- File table ---
        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
        for (relative, path, mtime) in &file_mtimes {
            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(&path_bytes);
            // An mtime before the Unix epoch is written as a zero duration.
            let duration = mtime
                .duration_since(SystemTime::UNIX_EPOCH)
                .unwrap_or_default();
            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
            // Missing size / hash records fall back to 0 and the zero hash.
            let size = index.file_sizes.get(*path).copied().unwrap_or_default();
            buf.extend_from_slice(&size.to_le_bytes());
            let hash = index
                .file_hashes
                .get(*path)
                .copied()
                .unwrap_or_else(cache_freshness::zero_hash);
            buf.extend_from_slice(hash.as_bytes());
        }

        // --- Entries ---
        for (relative, entry) in &entries {
            let c = &entry.chunk;
            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(&file_bytes);

            let name_bytes = c.name.as_bytes();
            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(name_bytes);

            buf.push(symbol_kind_to_u8(&c.kind));
            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
            buf.push(c.exported as u8);

            let snippet_bytes = c.snippet.as_bytes();
            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(snippet_bytes);

            let embed_bytes = c.embed_text.as_bytes();
            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(embed_bytes);

            // The vector is written without a length prefix of its own.
            for &val in &entry.vector {
                buf.extend_from_slice(&val.to_le_bytes());
            }
        }

        buf
    }
3894
3895    #[derive(Default)]
3896    struct RecordingEmbedder {
3897        calls: Vec<Vec<String>>,
3898    }
3899
3900    impl RecordingEmbedder {
3901        fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3902            let vectors = texts
3903                .iter()
3904                .map(|text| deterministic_test_vector(text))
3905                .collect();
3906            self.calls.push(texts);
3907            Ok(vectors)
3908        }
3909
3910        fn total_embedded_texts(&self) -> usize {
3911            self.calls.iter().map(Vec::len).sum()
3912        }
3913
3914        fn embedded_texts(&self) -> Vec<&str> {
3915            self.calls
3916                .iter()
3917                .flat_map(|batch| batch.iter().map(String::as_str))
3918                .collect()
3919        }
3920    }
3921
3922    fn deterministic_test_vector(text: &str) -> Vec<f32> {
3923        let hash = blake3::hash(text.as_bytes());
3924        let bytes = hash.as_bytes();
3925        vec![
3926            1.0,
3927            bytes[0] as f32 / 255.0,
3928            bytes[1] as f32 / 255.0,
3929            bytes[2] as f32 / 255.0,
3930        ]
3931    }
3932
3933    fn build_recorded_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3934        let mut embedder = RecordingEmbedder::default();
3935        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3936        SemanticIndex::build(project_root, files, &mut embed, 16).unwrap()
3937    }
3938
3939    fn force_stale(index: &mut SemanticIndex, file: &Path) {
3940        set_file_metadata(index, file, SystemTime::UNIX_EPOCH, 0);
3941    }
3942
3943    fn write_source(path: &Path, source: &str) {
3944        if let Some(parent) = path.parent() {
3945            fs::create_dir_all(parent).unwrap();
3946        }
3947        fs::write(path, source).unwrap();
3948    }
3949
3950    fn entries_for_file<'a>(index: &'a SemanticIndex, file: &Path) -> Vec<&'a EmbeddingEntry> {
3951        index
3952            .entries
3953            .iter()
3954            .filter(|entry| entry.chunk.file == file)
3955            .collect()
3956    }
3957
3958    fn entry_by_name<'a>(index: &'a SemanticIndex, file: &Path, name: &str) -> &'a EmbeddingEntry {
3959        index
3960            .entries
3961            .iter()
3962            .find(|entry| entry.chunk.file == file && entry.chunk.name == name)
3963            .unwrap_or_else(|| panic!("missing semantic entry {name} in {}", file.display()))
3964    }
3965
3966    fn file_summary_entry<'a>(index: &'a SemanticIndex, file: &Path) -> &'a EmbeddingEntry {
3967        index
3968            .entries
3969            .iter()
3970            .find(|entry| entry.chunk.file == file && entry.chunk.kind == SymbolKind::FileSummary)
3971            .unwrap_or_else(|| panic!("missing file-summary entry in {}", file.display()))
3972    }
3973
3974    #[test]
3975    fn refresh_stale_line_shift_reuses_all_chunks_and_retains_entries() {
3976        let temp = tempfile::tempdir().unwrap();
3977        let project_root = temp.path();
3978        let file = project_root.join("src/lib.rs");
3979        let original = "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n";
3980        write_source(&file, original);
3981
3982        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3983        let original_entry_count = index.entries.len();
3984        let original_alpha_vector = entry_by_name(&index, &file, "alpha").vector.clone();
3985
3986        write_source(&file, &format!("\n{original}"));
3987        force_stale(&mut index, &file);
3988
3989        let mut embedder = RecordingEmbedder::default();
3990        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3991        let mut progress = |_done: usize, _total: usize| {};
3992        let summary = index
3993            .refresh_stale_files(
3994                project_root,
3995                std::slice::from_ref(&file),
3996                &mut embed,
3997                16,
3998                &mut progress,
3999            )
4000            .unwrap();
4001
4002        assert_eq!(summary.changed, 1);
4003        assert_eq!(embedder.total_embedded_texts(), 0);
4004        assert_eq!(index.entries.len(), original_entry_count);
4005        let shifted_alpha = entry_by_name(&index, &file, "alpha");
4006        assert_eq!(shifted_alpha.chunk.start_line, 1);
4007        assert_eq!(shifted_alpha.vector, original_alpha_vector);
4008    }
4009
4010    #[test]
4011    fn refresh_invalidated_line_shift_emits_full_replacement_delta_for_apply() {
4012        let temp = tempfile::tempdir().unwrap();
4013        let project_root = temp.path();
4014        let file = project_root.join("src/lib.rs");
4015        let original = "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n";
4016        write_source(&file, original);
4017
4018        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4019        let mut serving_index = worker_index.clone();
4020        let original_entry_count = worker_index.entries.len();
4021
4022        write_source(&file, &format!("\n{original}"));
4023
4024        let mut embedder = RecordingEmbedder::default();
4025        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4026        let mut progress = |_done: usize, _total: usize| {};
4027        let update = worker_index
4028            .refresh_invalidated_files(
4029                project_root,
4030                std::slice::from_ref(&file),
4031                &mut embed,
4032                16,
4033                100,
4034                &mut progress,
4035            )
4036            .unwrap();
4037
4038        assert_eq!(embedder.total_embedded_texts(), 0);
4039        assert_eq!(update.added_entries.len(), original_entry_count);
4040        assert_eq!(worker_index.entries.len(), original_entry_count);
4041
4042        serving_index.apply_refresh_update(
4043            update.added_entries,
4044            update.updated_metadata,
4045            &update.completed_paths,
4046        );
4047
4048        assert_eq!(serving_index.entries.len(), original_entry_count);
4049        assert_eq!(
4050            entries_for_file(&serving_index, &file).len(),
4051            original_entry_count
4052        );
4053        assert_eq!(
4054            entry_by_name(&serving_index, &file, "alpha")
4055                .chunk
4056                .start_line,
4057            1
4058        );
4059    }
4060
4061    #[test]
4062    fn refresh_invalidated_one_symbol_edit_embeds_only_changed_symbol() {
4063        let temp = tempfile::tempdir().unwrap();
4064        let project_root = temp.path();
4065        let file = project_root.join("src/lib.rs");
4066        write_source(
4067            &file,
4068            "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n",
4069        );
4070
4071        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4072        let original_entry_count = index.entries.len();
4073        let beta_vector = entry_by_name(&index, &file, "beta").vector.clone();
4074
4075        write_source(
4076            &file,
4077            "pub fn alpha() -> i32 {\n    10\n}\n\npub fn beta() -> i32 {\n    2\n}\n",
4078        );
4079
4080        let mut embedder = RecordingEmbedder::default();
4081        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4082        let mut progress = |_done: usize, _total: usize| {};
4083        let update = index
4084            .refresh_invalidated_files(
4085                project_root,
4086                std::slice::from_ref(&file),
4087                &mut embed,
4088                16,
4089                100,
4090                &mut progress,
4091            )
4092            .unwrap();
4093
4094        assert_eq!(embedder.total_embedded_texts(), 1);
4095        assert!(embedder.embedded_texts()[0].contains("name:alpha"));
4096        assert_eq!(update.added_entries.len(), original_entry_count);
4097        assert_eq!(entry_by_name(&index, &file, "beta").vector, beta_vector);
4098    }
4099
4100    #[test]
4101    fn refresh_reuses_one_old_vector_for_two_byte_identical_symbols() {
4102        let temp = tempfile::tempdir().unwrap();
4103        let project_root = temp.path();
4104        let file = project_root.join("src/dupe.js");
4105        let one_duplicate = "function duplicate() {\n  return 1;\n}\n";
4106        write_source(&file, one_duplicate);
4107
4108        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4109        let original_vector = entry_by_name(&index, &file, "duplicate").vector.clone();
4110
4111        write_source(&file, &format!("{one_duplicate}\n{one_duplicate}"));
4112
4113        let mut embedder = RecordingEmbedder::default();
4114        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4115        let mut progress = |_done: usize, _total: usize| {};
4116        index
4117            .refresh_invalidated_files(
4118                project_root,
4119                std::slice::from_ref(&file),
4120                &mut embed,
4121                16,
4122                100,
4123                &mut progress,
4124            )
4125            .unwrap();
4126
4127        let duplicate_entries = index
4128            .entries
4129            .iter()
4130            .filter(|entry| entry.chunk.file == file && entry.chunk.name == "duplicate")
4131            .collect::<Vec<_>>();
4132        assert_eq!(duplicate_entries.len(), 2);
4133        assert_eq!(embedder.total_embedded_texts(), 0);
4134        assert_eq!(duplicate_entries[0].vector, original_vector);
4135        assert_eq!(duplicate_entries[1].vector, original_vector);
4136    }
4137
4138    #[test]
4139    fn file_summary_reuses_on_body_edit_and_misses_on_leading_doc_edit() {
4140        let temp = tempfile::tempdir().unwrap();
4141        let project_root = temp.path();
4142        let file = project_root.join("src/lib.rs");
4143        write_source(
4144            &file,
4145            "//! module docs v1\n\npub fn alpha() -> i32 {\n    1\n}\n",
4146        );
4147
4148        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4149        let summary_before = file_summary_entry(&index, &file).vector.clone();
4150
4151        write_source(
4152            &file,
4153            "//! module docs v1\n\npub fn alpha() -> i32 {\n    2\n}\n",
4154        );
4155        let mut body_embedder = RecordingEmbedder::default();
4156        let mut body_embed = |texts: Vec<String>| body_embedder.embed(texts);
4157        let mut progress = |_done: usize, _total: usize| {};
4158        index
4159            .refresh_invalidated_files(
4160                project_root,
4161                std::slice::from_ref(&file),
4162                &mut body_embed,
4163                16,
4164                100,
4165                &mut progress,
4166            )
4167            .unwrap();
4168        assert_eq!(body_embedder.total_embedded_texts(), 1);
4169        assert!(body_embedder.embedded_texts()[0].contains("name:alpha"));
4170        assert_eq!(file_summary_entry(&index, &file).vector, summary_before);
4171
4172        write_source(
4173            &file,
4174            "//! module docs v2\n\npub fn alpha() -> i32 {\n    2\n}\n",
4175        );
4176        let mut doc_embedder = RecordingEmbedder::default();
4177        let mut doc_embed = |texts: Vec<String>| doc_embedder.embed(texts);
4178        index
4179            .refresh_invalidated_files(
4180                project_root,
4181                std::slice::from_ref(&file),
4182                &mut doc_embed,
4183                16,
4184                100,
4185                &mut progress,
4186            )
4187            .unwrap();
4188
4189        assert_eq!(doc_embedder.total_embedded_texts(), 1);
4190        assert!(doc_embedder.embedded_texts()[0].contains("kind:file-summary"));
4191        assert_ne!(file_summary_entry(&index, &file).vector, summary_before);
4192    }
4193
4194    #[test]
4195    fn refresh_invalidated_deleted_file_drops_entries_without_embedding() {
4196        let temp = tempfile::tempdir().unwrap();
4197        let project_root = temp.path();
4198        let file = project_root.join("src/lib.rs");
4199        write_source(&file, "pub fn alpha() -> i32 {\n    1\n}\n");
4200
4201        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4202        let mut serving_index = worker_index.clone();
4203        fs::remove_file(&file).unwrap();
4204
4205        let mut embedder = RecordingEmbedder::default();
4206        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4207        let mut progress = |_done: usize, _total: usize| {};
4208        let update = worker_index
4209            .refresh_invalidated_files(
4210                project_root,
4211                std::slice::from_ref(&file),
4212                &mut embed,
4213                16,
4214                100,
4215                &mut progress,
4216            )
4217            .unwrap();
4218
4219        assert_eq!(update.summary.deleted, 1);
4220        assert_eq!(embedder.total_embedded_texts(), 0);
4221        assert!(worker_index.entries.is_empty());
4222
4223        serving_index.apply_refresh_update(
4224            update.added_entries,
4225            update.updated_metadata,
4226            &update.completed_paths,
4227        );
4228        assert!(serving_index.entries.is_empty());
4229    }
4230
4231    #[test]
4232    fn watcher_collect_failure_does_not_resurrect_stale_entries() {
4233        let temp = tempfile::tempdir().unwrap();
4234        let project_root = temp.path();
4235        let file = project_root.join("src/lib.rs");
4236        write_source(&file, "pub fn alpha() -> i32 {\n    1\n}\n");
4237
4238        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4239        let mut serving_index = worker_index.clone();
4240        fs::write(&file, [0xff, 0xfe, 0xfd]).unwrap();
4241
4242        let mut embedder = RecordingEmbedder::default();
4243        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4244        let mut progress = |_done: usize, _total: usize| {};
4245        let update = worker_index
4246            .refresh_invalidated_files(
4247                project_root,
4248                std::slice::from_ref(&file),
4249                &mut embed,
4250                16,
4251                100,
4252                &mut progress,
4253            )
4254            .unwrap();
4255
4256        assert_eq!(embedder.total_embedded_texts(), 0);
4257        assert!(update.added_entries.is_empty());
4258        assert!(worker_index.entries.is_empty());
4259        assert!(!worker_index.file_mtimes.contains_key(&file));
4260
4261        serving_index.apply_refresh_update(
4262            update.added_entries,
4263            update.updated_metadata,
4264            &update.completed_paths,
4265        );
4266        assert!(serving_index.entries.is_empty());
4267        assert!(!serving_index.file_mtimes.contains_key(&file));
4268    }
4269
4270    #[test]
4271    fn refresh_invalidated_cap_deferral_remains_file_count_based() {
4272        let temp = tempfile::tempdir().unwrap();
4273        let project_root = temp.path();
4274        let indexed = project_root.join("src/a.rs");
4275        let deferred = project_root.join("src/b.rs");
4276        write_source(&indexed, "pub fn alpha() -> i32 {\n    1\n}\n");
4277        write_source(&deferred, "pub fn beta() -> i32 {\n    2\n}\n");
4278
4279        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&indexed));
4280        let mut embedder = RecordingEmbedder::default();
4281        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4282        let mut progress = |_done: usize, _total: usize| {};
4283        let update = index
4284            .refresh_invalidated_files(
4285                project_root,
4286                std::slice::from_ref(&deferred),
4287                &mut embed,
4288                16,
4289                1,
4290                &mut progress,
4291            )
4292            .unwrap();
4293
4294        assert_eq!(update.summary.total_processed, 1);
4295        assert_eq!(update.summary.added, 0);
4296        assert_eq!(embedder.total_embedded_texts(), 0);
4297        assert_eq!(index.indexed_file_count(), 1);
4298        assert!(index.deferred_files.contains(&deferred));
4299        assert!(entries_for_file(&index, &deferred).is_empty());
4300    }
4301
4302    #[test]
4303    fn semantic_cache_serialization_skips_paths_outside_project_root() {
4304        let dir = tempfile::tempdir().expect("create temp dir");
4305        let project = fs::canonicalize(dir.path()).expect("canonical project");
4306        let outside = project.join("..").join("outside.rs");
4307        let mut index = SemanticIndex::new(project.clone(), 3);
4308        index
4309            .file_mtimes
4310            .insert(outside.clone(), SystemTime::UNIX_EPOCH);
4311        index.file_sizes.insert(outside.clone(), 1);
4312        index
4313            .file_hashes
4314            .insert(outside.clone(), cache_freshness::zero_hash());
4315        index.entries.push(EmbeddingEntry {
4316            chunk: SemanticChunk {
4317                file: outside,
4318                name: "outside".to_string(),
4319                qualified_name: None,
4320                kind: SymbolKind::Function,
4321                start_line: 0,
4322                end_line: 0,
4323                exported: false,
4324                embed_text: "outside".to_string(),
4325                snippet: "outside".to_string(),
4326            },
4327            vector: vec![1.0, 0.0, 0.0],
4328        });
4329
4330        let bytes = index.to_bytes();
4331        let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
4332        assert_eq!(loaded.entries.len(), 0);
4333        assert!(loaded.file_mtimes.is_empty());
4334    }
4335
4336    #[test]
4337    fn semantic_search_bounded_top_k_matches_reference_full_sort() {
4338        let project_root = test_project_root();
4339        let file = project_root.join("src/lib.rs");
4340        let mut index = SemanticIndex::new(project_root, 2);
4341        let entries = [
4342            ("alpha", vec![1.0, 0.0], false),
4343            ("beta", vec![0.0, 1.0], false),
4344            ("gamma", vec![1.0, 0.0], false),
4345            ("delta", vec![0.5, 0.5], true),
4346            ("epsilon", vec![-1.0, 0.0], false),
4347        ];
4348        for (line, (name, vector, exported)) in entries.into_iter().enumerate() {
4349            index.entries.push(EmbeddingEntry {
4350                chunk: SemanticChunk {
4351                    file: file.clone(),
4352                    name: name.to_string(),
4353                    qualified_name: None,
4354                    kind: SymbolKind::Function,
4355                    start_line: line as u32 + 1,
4356                    end_line: line as u32 + 1,
4357                    exported,
4358                    embed_text: name.to_string(),
4359                    snippet: format!("fn {name}() {{}}"),
4360                },
4361                vector,
4362            });
4363        }
4364
4365        let query = vec![1.0, 0.0];
4366        let top_k = 4;
4367        let mut reference: Vec<(f32, usize)> = index
4368            .entries
4369            .iter()
4370            .enumerate()
4371            .map(|(idx, entry)| {
4372                let mut score = cosine_similarity(&query, &entry.vector);
4373                if entry.chunk.exported {
4374                    score *= 1.1;
4375                }
4376                (score, idx)
4377            })
4378            .collect();
4379        reference.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
4380        let expected: Vec<(String, f32)> = reference
4381            .into_iter()
4382            .take(top_k)
4383            .map(|(score, idx)| (index.entries[idx].chunk.name.clone(), score))
4384            .collect();
4385
4386        let actual: Vec<(String, f32)> = index
4387            .search(&query, top_k)
4388            .into_iter()
4389            .map(|result| (result.name, result.score))
4390            .collect();
4391
4392        assert_eq!(
4393            actual.iter().map(|(name, _)| name).collect::<Vec<_>>(),
4394            expected.iter().map(|(name, _)| name).collect::<Vec<_>>()
4395        );
4396        for ((_, actual_score), (_, expected_score)) in actual.iter().zip(expected.iter()) {
4397            assert!((actual_score - expected_score).abs() < 1e-6);
4398        }
4399        assert_eq!(actual[0].0, "alpha");
4400        assert_eq!(actual[1].0, "gamma", "equal scores keep insertion order");
4401        assert!(index.search(&query, 0).is_empty());
4402    }
4403
4404    #[test]
4405    fn test_cosine_similarity_identical() {
4406        let a = vec![1.0, 0.0, 0.0];
4407        let b = vec![1.0, 0.0, 0.0];
4408        assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
4409    }
4410
4411    #[test]
4412    fn test_cosine_similarity_orthogonal() {
4413        let a = vec![1.0, 0.0, 0.0];
4414        let b = vec![0.0, 1.0, 0.0];
4415        assert!(cosine_similarity(&a, &b).abs() < 0.001);
4416    }
4417
4418    #[test]
4419    fn test_cosine_similarity_opposite() {
4420        let a = vec![1.0, 0.0, 0.0];
4421        let b = vec![-1.0, 0.0, 0.0];
4422        assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
4423    }
4424
4425    #[test]
4426    fn test_serialization_roundtrip() {
4427        let project_root = test_project_root();
4428        let file = project_root.join("src/main.rs");
4429        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
4430        index.entries.push(EmbeddingEntry {
4431            chunk: SemanticChunk {
4432                file: file.clone(),
4433                name: "handle_request".to_string(),
4434                qualified_name: None,
4435                kind: SymbolKind::Function,
4436                start_line: 10,
4437                end_line: 25,
4438                exported: true,
4439                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
4440                snippet: "fn handle_request() {\n  // ...\n}".to_string(),
4441            },
4442            vector: vec![0.1, 0.2, 0.3, 0.4],
4443        });
4444        index.dimension = 4;
4445        index
4446            .file_mtimes
4447            .insert(file.clone(), SystemTime::UNIX_EPOCH);
4448        index.file_sizes.insert(file, 0);
4449        index.set_fingerprint(SemanticIndexFingerprint {
4450            backend: "fastembed".to_string(),
4451            model: "all-MiniLM-L6-v2".to_string(),
4452            base_url: FALLBACK_BACKEND.to_string(),
4453            dimension: 4,
4454            chunking_version: default_chunking_version(),
4455        });
4456
4457        let bytes = index.to_bytes();
4458        let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
4459
4460        assert_eq!(restored.entries.len(), 1);
4461        assert_eq!(restored.entries[0].chunk.name, "handle_request");
4462        assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
4463        assert_eq!(restored.dimension, 4);
4464        assert_eq!(restored.backend_label(), Some("fastembed"));
4465        assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
4466    }
4467
4468    #[test]
4469    fn semantic_cache_v6_loads_and_v7_round_trips_qualified_names() {
4470        let storage = tempfile::tempdir().expect("create storage dir");
4471        let project = storage.path().join("project");
4472        fs::create_dir_all(project.join("src")).expect("create project src");
4473        let file = project.join("src/lib.rs");
4474        fs::write(&file, "pub fn alpha() {}\npub fn beta() {}\n").expect("write source");
4475        let project_root = fs::canonicalize(&project).expect("canonical project");
4476        let file = fs::canonicalize(&file).expect("canonical file");
4477
4478        let mut index = SemanticIndex::new(project_root.clone(), 3);
4479        let mtime = SystemTime::UNIX_EPOCH + Duration::new(123, 456);
4480        index.file_mtimes.insert(file.clone(), mtime);
4481        index.file_sizes.insert(file.clone(), 42);
4482        index
4483            .file_hashes
4484            .insert(file.clone(), cache_freshness::zero_hash());
4485        index.entries.push(EmbeddingEntry {
4486            chunk: SemanticChunk {
4487                file: file.clone(),
4488                name: "alpha".to_string(),
4489                qualified_name: Some("Service.alpha".to_string()),
4490                kind: SymbolKind::Function,
4491                start_line: 0,
4492                end_line: 0,
4493                exported: true,
4494                embed_text: "file:src/lib.rs kind:function name:alpha".to_string(),
4495                snippet: "pub fn alpha() {}".to_string(),
4496            },
4497            vector: vec![0.1, 0.2, 0.3],
4498        });
4499        index.entries.push(EmbeddingEntry {
4500            chunk: SemanticChunk {
4501                file: file.clone(),
4502                name: "beta".to_string(),
4503                qualified_name: Some("Service.beta".to_string()),
4504                kind: SymbolKind::Function,
4505                start_line: 1,
4506                end_line: 1,
4507                exported: true,
4508                embed_text: "file:src/lib.rs kind:function name:beta".to_string(),
4509                snippet: "pub fn beta() {}".to_string(),
4510            },
4511            vector: vec![0.4, 0.5, 0.6],
4512        });
4513        let fingerprint = SemanticIndexFingerprint {
4514            backend: "fastembed".to_string(),
4515            model: "all-MiniLM-L6-v2".to_string(),
4516            base_url: FALLBACK_BACKEND.to_string(),
4517            dimension: 3,
4518            chunking_version: default_chunking_version(),
4519        };
4520        let fingerprint_before = fingerprint.as_string();
4521        index.set_fingerprint(fingerprint.clone());
4522
4523        let legacy_bytes = legacy_semantic_index_bytes(&index);
4524        assert_eq!(legacy_bytes[0], SEMANTIC_INDEX_VERSION_V6);
4525        let legacy_dir = storage.path().join("semantic/legacy-proj");
4526        fs::create_dir_all(&legacy_dir).expect("create legacy semantic dir");
4527        let legacy_path = legacy_dir.join("semantic.bin");
4528        fs::write(&legacy_path, &legacy_bytes).expect("write legacy semantic.bin");
4529        let legacy_loaded = SemanticIndex::read_from_disk(
4530            storage.path(),
4531            "legacy-proj",
4532            &project_root,
4533            false,
4534            Some(&fingerprint_before),
4535        )
4536        .expect("load v6 semantic index");
4537        assert!(
4538            legacy_path.exists(),
4539            "compatible V6 cache must not be deleted"
4540        );
4541        assert!(legacy_loaded
4542            .entries
4543            .iter()
4544            .all(|entry| entry.chunk.qualified_name.is_none()));
4545        assert_eq!(
4546            legacy_loaded.fingerprint().unwrap().as_string(),
4547            fingerprint_before
4548        );
4549
4550        let v7_bytes = index.to_bytes();
4551        assert_eq!(v7_bytes[0], SEMANTIC_INDEX_VERSION_V7);
4552        assert_ne!(v7_bytes, legacy_bytes);
4553        let restored = SemanticIndex::from_bytes(&v7_bytes, &project_root).unwrap();
4554        assert_eq!(
4555            restored.entries[0].chunk.qualified_name.as_deref(),
4556            Some("Service.alpha")
4557        );
4558        assert_eq!(
4559            restored.entries[1].chunk.qualified_name.as_deref(),
4560            Some("Service.beta")
4561        );
4562        assert_eq!(
4563            restored.fingerprint().unwrap().as_string(),
4564            fingerprint_before
4565        );
4566
4567        index.write_to_disk(storage.path(), "proj");
4568        let data_path = storage.path().join("semantic/proj/semantic.bin");
4569        let persisted = fs::read(&data_path).expect("read semantic.bin");
4570        assert_eq!(persisted[0], SEMANTIC_INDEX_VERSION_V7);
4571
4572        let loaded = SemanticIndex::read_from_disk(
4573            storage.path(),
4574            "proj",
4575            &project_root,
4576            false,
4577            Some(&fingerprint_before),
4578        )
4579        .expect("load semantic index");
4580        assert_eq!(loaded.entries.len(), index.entries.len());
4581        assert_eq!(loaded.dimension, index.dimension);
4582        assert_eq!(
4583            loaded.fingerprint().unwrap().as_string(),
4584            fingerprint_before
4585        );
4586        assert_eq!(loaded.file_mtimes.get(&file), Some(&mtime));
4587        assert_eq!(loaded.file_sizes.get(&file), Some(&42));
4588        assert_eq!(
4589            loaded.file_hashes.get(&file),
4590            Some(&cache_freshness::zero_hash())
4591        );
4592        for (actual, expected) in loaded.entries.iter().zip(index.entries.iter()) {
4593            assert_eq!(actual.chunk.file, expected.chunk.file);
4594            assert_eq!(actual.chunk.name, expected.chunk.name);
4595            assert_eq!(actual.chunk.qualified_name, expected.chunk.qualified_name);
4596            assert_eq!(actual.chunk.kind, expected.chunk.kind);
4597            assert_eq!(actual.chunk.start_line, expected.chunk.start_line);
4598            assert_eq!(actual.chunk.end_line, expected.chunk.end_line);
4599            assert_eq!(actual.chunk.exported, expected.chunk.exported);
4600            assert_eq!(actual.chunk.embed_text, expected.chunk.embed_text);
4601            assert_eq!(actual.chunk.snippet, expected.chunk.snippet);
4602            assert_eq!(actual.vector, expected.vector);
4603        }
4604        assert_eq!(loaded.to_bytes(), persisted);
4605        assert_eq!(fingerprint.as_string(), fingerprint_before);
4606    }
4607
4608    #[test]
4609    fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
4610        let cases = [
4611            (SymbolKind::Function, 0),
4612            (SymbolKind::Class, 1),
4613            (SymbolKind::Method, 2),
4614            (SymbolKind::Struct, 3),
4615            (SymbolKind::Interface, 4),
4616            (SymbolKind::Enum, 5),
4617            (SymbolKind::TypeAlias, 6),
4618            (SymbolKind::Variable, 7),
4619            (SymbolKind::Heading, 8),
4620            (SymbolKind::FileSummary, 9),
4621        ];
4622
4623        for (kind, encoded) in cases {
4624            assert_eq!(symbol_kind_to_u8(&kind), encoded);
4625            assert_eq!(u8_to_symbol_kind(encoded), kind);
4626        }
4627    }
4628
4629    #[test]
4630    fn test_search_top_k() {
4631        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4632        index.dimension = 3;
4633
4634        // Add entries with known vectors
4635        for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
4636            let mut vec = vec![0.0f32; 3];
4637            vec[i] = 1.0; // orthogonal vectors
4638            index.entries.push(EmbeddingEntry {
4639                chunk: SemanticChunk {
4640                    file: PathBuf::from("/src/lib.rs"),
4641                    name: name.to_string(),
4642                    qualified_name: None,
4643                    kind: SymbolKind::Function,
4644                    start_line: (i * 10 + 1) as u32,
4645                    end_line: (i * 10 + 5) as u32,
4646                    exported: true,
4647                    embed_text: format!("kind:function name:{}", name),
4648                    snippet: format!("fn {}() {{}}", name),
4649                },
4650                vector: vec,
4651            });
4652        }
4653
4654        // Query aligned with "auth" (index 0)
4655        let query = vec![0.9, 0.1, 0.0];
4656        let results = index.search(&query, 2);
4657
4658        assert_eq!(results.len(), 2);
4659        assert_eq!(results[0].name, "auth"); // highest score
4660        assert!(results[0].score > results[1].score);
4661    }
4662
4663    #[test]
4664    fn test_empty_index_search() {
4665        let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4666        let results = index.search(&[0.1, 0.2, 0.3], 10);
4667        assert!(results.is_empty());
4668    }
4669
4670    #[test]
4671    fn single_line_symbol_builds_non_empty_snippet() {
4672        let symbol = Symbol {
4673            name: "answer".to_string(),
4674            kind: SymbolKind::Variable,
4675            range: crate::symbols::Range {
4676                start_line: 0,
4677                start_col: 0,
4678                end_line: 0,
4679                end_col: 24,
4680            },
4681            signature: Some("const answer = 42".to_string()),
4682            scope_chain: Vec::new(),
4683            exported: true,
4684            parent: None,
4685        };
4686        let source = "export const answer = 42;\n";
4687
4688        let snippet = build_snippet(&symbol, source);
4689
4690        assert_eq!(snippet, "export const answer = 42;");
4691    }
4692
4693    #[test]
4694    fn optimized_file_chunk_collection_matches_file_parser_path() {
4695        let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
4696        let file = project_root.join("src/semantic_index.rs");
4697        let source = std::fs::read_to_string(&file).unwrap();
4698
4699        let mut legacy_parser = FileParser::new();
4700        let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
4701        let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
4702
4703        let mut parsers = HashMap::new();
4704        let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
4705
4706        assert_eq!(
4707            chunk_fingerprint(&optimized_chunks),
4708            chunk_fingerprint(&legacy_chunks)
4709        );
4710    }
4711
4712    #[test]
4713    fn collect_file_chunks_indexes_java_symbols() {
4714        let dir = tempfile::tempdir().unwrap();
4715        let file = dir.path().join("Greeter.java");
4716        std::fs::write(
4717            &file,
4718            r#"package example;
4719
4720public class Greeter {
4721    public String greet(String name) {
4722        return "Hello, " + name;
4723    }
4724}
4725"#,
4726        )
4727        .unwrap();
4728
4729        let mut parsers = HashMap::new();
4730        let chunks = collect_file_chunks(dir.path(), &file, &mut parsers).unwrap();
4731
4732        assert!(
4733            !chunks.is_empty(),
4734            "Java file should produce semantic chunks"
4735        );
4736        assert!(
4737            chunks
4738                .iter()
4739                .any(|chunk| chunk.name == "Greeter" && chunk.kind == SymbolKind::Class),
4740            "Java class symbol should be chunked: {chunks:?}"
4741        );
4742        assert!(
4743            chunks
4744                .iter()
4745                .any(|chunk| chunk.name == "greet" && chunk.kind == SymbolKind::Method),
4746            "Java method symbol should be chunked: {chunks:?}"
4747        );
4748    }
4749
4750    fn chunk_fingerprint(
4751        chunks: &[SemanticChunk],
4752    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
4753        chunks
4754            .iter()
4755            .map(|chunk| {
4756                (
4757                    chunk.name.clone(),
4758                    chunk.kind.clone(),
4759                    chunk.start_line,
4760                    chunk.end_line,
4761                    chunk.exported,
4762                    chunk.embed_text.clone(),
4763                    chunk.snippet.clone(),
4764                )
4765            })
4766            .collect()
4767    }
4768
4769    #[test]
4770    fn collect_file_chunks_skips_oversized_file() {
4771        let dir = tempfile::tempdir().unwrap();
4772        let big = dir.path().join("huge.ts");
4773        // Just over the cap: a valid TS file that would otherwise yield chunks.
4774        let filler = "export const x = 1;\n"
4775            .repeat(((MAX_SEMANTIC_FILE_BYTES as usize) / "export const x = 1;\n".len()) + 16);
4776        std::fs::write(&big, &filler).unwrap();
4777        assert!(big.metadata().unwrap().len() > MAX_SEMANTIC_FILE_BYTES);
4778
4779        let mut parsers = HashMap::new();
4780        // Oversized → tracked with zero chunks, NOT an error (so the caller keeps
4781        // the file in metadata and freshness skips re-reading it).
4782        let chunks = collect_file_chunks(dir.path(), &big, &mut parsers).unwrap();
4783        assert!(chunks.is_empty(), "oversized file must yield no chunks");
4784
4785        // A small file of the same language still produces chunks.
4786        let small = dir.path().join("small.ts");
4787        std::fs::write(&small, "export function foo() { return 1; }\n").unwrap();
4788        let small_chunks = collect_file_chunks(dir.path(), &small, &mut parsers).unwrap();
4789        assert!(!small_chunks.is_empty(), "small file should still chunk");
4790    }
4791
4792    #[test]
4793    fn rejects_oversized_dimension_during_deserialization() {
4794        let mut bytes = Vec::new();
4795        bytes.push(1u8);
4796        bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
4797        bytes.extend_from_slice(&0u32.to_le_bytes());
4798        bytes.extend_from_slice(&0u32.to_le_bytes());
4799
4800        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4801    }
4802
4803    #[test]
4804    fn rejects_oversized_entry_count_during_deserialization() {
4805        let mut bytes = Vec::new();
4806        bytes.push(1u8);
4807        bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
4808        bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
4809        bytes.extend_from_slice(&0u32.to_le_bytes());
4810
4811        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4812    }
4813
4814    #[test]
4815    fn invalidate_file_removes_entries_and_mtime() {
4816        let target = PathBuf::from("/src/main.rs");
4817        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4818        index.entries.push(EmbeddingEntry {
4819            chunk: SemanticChunk {
4820                file: target.clone(),
4821                name: "main".to_string(),
4822                qualified_name: None,
4823                kind: SymbolKind::Function,
4824                start_line: 0,
4825                end_line: 1,
4826                exported: false,
4827                embed_text: "main".to_string(),
4828                snippet: "fn main() {}".to_string(),
4829            },
4830            vector: vec![1.0; DEFAULT_DIMENSION],
4831        });
4832        index
4833            .file_mtimes
4834            .insert(target.clone(), SystemTime::UNIX_EPOCH);
4835        index.file_sizes.insert(target.clone(), 0);
4836
4837        index.invalidate_file(&target);
4838
4839        assert!(index.entries.is_empty());
4840        assert!(!index.file_mtimes.contains_key(&target));
4841        assert!(!index.file_sizes.contains_key(&target));
4842    }
4843
4844    #[test]
4845    fn refresh_missing_changed_file_is_purged_after_collect() {
4846        let temp = tempfile::tempdir().unwrap();
4847        let project_root = temp.path();
4848        let file = project_root.join("src/lib.rs");
4849        fs::create_dir_all(file.parent().unwrap()).unwrap();
4850        write_rust_file(&file, "vanished_symbol");
4851
4852        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4853        let original_size = *index.file_sizes.get(&file).unwrap();
4854        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
4855        fs::remove_file(&file).unwrap();
4856
4857        let mut embed = test_vector_for_texts;
4858        let mut progress = |_done: usize, _total: usize| {};
4859        let summary = index
4860            .refresh_stale_files(
4861                project_root,
4862                std::slice::from_ref(&file),
4863                &mut embed,
4864                8,
4865                &mut progress,
4866            )
4867            .unwrap();
4868
4869        assert_eq!(summary.changed, 0);
4870        assert_eq!(summary.added, 0);
4871        assert_eq!(summary.deleted, 1);
4872        assert!(index.entries.is_empty());
4873        assert!(!index.file_mtimes.contains_key(&file));
4874        assert!(!index.file_sizes.contains_key(&file));
4875        assert!(!index.file_hashes.contains_key(&file));
4876    }
4877
4878    #[test]
4879    fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
4880        let temp = tempfile::tempdir().unwrap();
4881        let project_root = temp.path();
4882        let file = project_root.join("src/lib.rs");
4883        fs::create_dir_all(file.parent().unwrap()).unwrap();
4884        write_rust_file(&file, "kept_symbol");
4885
4886        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4887        let original_entry_count = index.entries.len();
4888        let original_mtime = *index.file_mtimes.get(&file).unwrap();
4889        let original_size = *index.file_sizes.get(&file).unwrap();
4890
4891        let stale_mtime = SystemTime::UNIX_EPOCH;
4892        set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
4893        fs::remove_file(&file).unwrap();
4894        fs::create_dir(&file).unwrap();
4895
4896        let mut embed = test_vector_for_texts;
4897        let mut progress = |_done: usize, _total: usize| {};
4898        let summary = index
4899            .refresh_stale_files(
4900                project_root,
4901                std::slice::from_ref(&file),
4902                &mut embed,
4903                8,
4904                &mut progress,
4905            )
4906            .unwrap();
4907
4908        assert_eq!(summary.changed, 0);
4909        assert_eq!(summary.added, 0);
4910        assert_eq!(summary.deleted, 0);
4911        assert_eq!(index.entries.len(), original_entry_count);
4912        assert!(index
4913            .entries
4914            .iter()
4915            .any(|entry| entry.chunk.name == "kept_symbol"));
4916        assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
4917        assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
4918        assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
4919    }
4920
4921    #[test]
4922    fn refresh_never_indexed_file_error_does_not_record_mtime() {
4923        let temp = tempfile::tempdir().unwrap();
4924        let project_root = temp.path();
4925        let missing = project_root.join("src/missing.rs");
4926        fs::create_dir_all(missing.parent().unwrap()).unwrap();
4927
4928        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4929        let mut embed = test_vector_for_texts;
4930        let mut progress = |_done: usize, _total: usize| {};
4931        let summary = index
4932            .refresh_stale_files(
4933                project_root,
4934                std::slice::from_ref(&missing),
4935                &mut embed,
4936                8,
4937                &mut progress,
4938            )
4939            .unwrap();
4940
4941        assert_eq!(summary.added, 0);
4942        assert_eq!(summary.changed, 0);
4943        assert_eq!(summary.deleted, 0);
4944        assert!(!index.file_mtimes.contains_key(&missing));
4945        assert!(!index.file_sizes.contains_key(&missing));
4946        assert!(index.entries.is_empty());
4947    }
4948
4949    #[test]
4950    fn refresh_reports_added_for_new_files() {
4951        let temp = tempfile::tempdir().unwrap();
4952        let project_root = temp.path();
4953        let existing = project_root.join("src/lib.rs");
4954        let added = project_root.join("src/new.rs");
4955        fs::create_dir_all(existing.parent().unwrap()).unwrap();
4956        write_rust_file(&existing, "existing_symbol");
4957        write_rust_file(&added, "added_symbol");
4958
4959        let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
4960        let mut embed = test_vector_for_texts;
4961        let mut progress = |_done: usize, _total: usize| {};
4962        let summary = index
4963            .refresh_stale_files(
4964                project_root,
4965                &[existing.clone(), added.clone()],
4966                &mut embed,
4967                8,
4968                &mut progress,
4969            )
4970            .unwrap();
4971
4972        assert_eq!(summary.added, 1);
4973        assert_eq!(summary.changed, 0);
4974        assert_eq!(summary.deleted, 0);
4975        assert_eq!(summary.total_processed, 2);
4976        assert!(index.file_mtimes.contains_key(&added));
4977        assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
4978    }
4979
4980    #[test]
4981    fn refresh_reports_deleted_for_removed_files() {
4982        let temp = tempfile::tempdir().unwrap();
4983        let project_root = temp.path();
4984        let deleted = project_root.join("src/deleted.rs");
4985        fs::create_dir_all(deleted.parent().unwrap()).unwrap();
4986        write_rust_file(&deleted, "deleted_symbol");
4987
4988        let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
4989        fs::remove_file(&deleted).unwrap();
4990
4991        let mut embed = test_vector_for_texts;
4992        let mut progress = |_done: usize, _total: usize| {};
4993        let summary = index
4994            .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
4995            .unwrap();
4996
4997        assert_eq!(summary.deleted, 1);
4998        assert_eq!(summary.changed, 0);
4999        assert_eq!(summary.added, 0);
5000        assert_eq!(summary.total_processed, 1);
5001        assert!(!index.file_mtimes.contains_key(&deleted));
5002        assert!(index.entries.is_empty());
5003    }
5004
5005    #[test]
5006    fn refresh_reports_changed_for_modified_files() {
5007        let temp = tempfile::tempdir().unwrap();
5008        let project_root = temp.path();
5009        let file = project_root.join("src/lib.rs");
5010        fs::create_dir_all(file.parent().unwrap()).unwrap();
5011        write_rust_file(&file, "old_symbol");
5012
5013        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
5014        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
5015        write_rust_file(&file, "new_symbol");
5016
5017        let mut embed = test_vector_for_texts;
5018        let mut progress = |_done: usize, _total: usize| {};
5019        let summary = index
5020            .refresh_stale_files(
5021                project_root,
5022                std::slice::from_ref(&file),
5023                &mut embed,
5024                8,
5025                &mut progress,
5026            )
5027            .unwrap();
5028
5029        assert_eq!(summary.changed, 1);
5030        assert_eq!(summary.added, 0);
5031        assert_eq!(summary.deleted, 0);
5032        assert_eq!(summary.total_processed, 1);
5033        assert!(index
5034            .entries
5035            .iter()
5036            .any(|entry| entry.chunk.name == "new_symbol"));
5037        assert!(!index
5038            .entries
5039            .iter()
5040            .any(|entry| entry.chunk.name == "old_symbol"));
5041    }
5042
5043    #[test]
5044    fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
5045        let temp = tempfile::tempdir().unwrap();
5046        let project_root = temp.path();
5047        let file = project_root.join("src/lib.rs");
5048        fs::create_dir_all(file.parent().unwrap()).unwrap();
5049        write_rust_file(&file, "clean_symbol");
5050
5051        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
5052        let original_entries = index.entries.len();
5053        let mut embed_called = false;
5054        let mut embed = |texts: Vec<String>| {
5055            embed_called = true;
5056            test_vector_for_texts(texts)
5057        };
5058        let mut progress = |_done: usize, _total: usize| {};
5059        let summary = index
5060            .refresh_stale_files(
5061                project_root,
5062                std::slice::from_ref(&file),
5063                &mut embed,
5064                8,
5065                &mut progress,
5066            )
5067            .unwrap();
5068
5069        assert!(summary.is_noop());
5070        assert_eq!(summary.total_processed, 1);
5071        assert!(!embed_called);
5072        assert_eq!(index.entries.len(), original_entries);
5073    }
5074
5075    #[test]
5076    fn detects_missing_onnx_runtime_from_dynamic_load_error() {
5077        let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
5078
5079        assert!(is_onnx_runtime_unavailable(message));
5080    }
5081
5082    #[test]
5083    fn formats_missing_onnx_runtime_with_install_hint() {
5084        let message = format_embedding_init_error(
5085            "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
5086        );
5087
5088        assert!(message.starts_with("ONNX Runtime not found. Install via:"));
5089        assert!(message.contains("Original error:"));
5090    }
5091
5092    #[test]
5093    fn interactive_query_embedding_model_caps_remote_timeout() {
5094        let mut config = SemanticBackendConfig {
5095            backend: SemanticBackend::OpenAiCompatible,
5096            model: "test-embedding".to_string(),
5097            base_url: Some("http://127.0.0.1:9".to_string()),
5098            api_key_env: None,
5099            timeout_ms: 0,
5100            max_batch_size: 64,
5101            max_files: 20_000,
5102        };
5103
5104        let build_model = SemanticEmbeddingModel::from_config(&config).unwrap();
5105        let query_model = SemanticEmbeddingModel::from_config_for_query(&config).unwrap();
5106        assert_eq!(
5107            build_model.timeout_ms(),
5108            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS,
5109            "background build keeps the longer default embedding timeout"
5110        );
5111        assert_eq!(
5112            query_model.timeout_ms(),
5113            DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS,
5114            "interactive query embedding is capped below the dispatch transport timeout"
5115        );
5116
5117        config.timeout_ms = 60_000;
5118        let query_model = SemanticEmbeddingModel::from_config_for_query(&config).unwrap();
5119        assert_eq!(
5120            query_model.timeout_ms(),
5121            DEFAULT_QUERY_EMBEDDING_TIMEOUT_MS,
5122            "explicitly long backend timeouts are capped for interactive queries"
5123        );
5124
5125        config.timeout_ms = 3_000;
5126        let query_model = SemanticEmbeddingModel::from_config_for_query(&config).unwrap();
5127        assert_eq!(
5128            query_model.timeout_ms(),
5129            3_000,
5130            "shorter explicit timeouts are respected for interactive queries"
5131        );
5132    }
5133
5134    #[test]
5135    fn openai_compatible_backend_embeds_with_mock_server() {
5136        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
5137            assert!(request_line.starts_with("POST "));
5138            assert_eq!(path, "/v1/embeddings");
5139            "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
5140        });
5141
5142        let config = SemanticBackendConfig {
5143            backend: SemanticBackend::OpenAiCompatible,
5144            model: "test-embedding".to_string(),
5145            base_url: Some(base_url),
5146            api_key_env: None,
5147            timeout_ms: 5_000,
5148            max_batch_size: 64,
5149            max_files: 20_000,
5150        };
5151
5152        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
5153        let vectors = model
5154            .embed(vec!["hello".to_string(), "world".to_string()])
5155            .unwrap();
5156
5157        assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
5158        handle.join().unwrap();
5159    }
5160
5161    /// Regression for issue #36: AFT was sending TWO Content-Type headers
5162    /// on the OpenAI embeddings request — once implicitly via `.json(&body)`
5163    /// and again explicitly via `.header("Content-Type", "application/json")`.
5164    /// reqwest's `.header()` calls `HeaderMap::append`, which produces two
5165    /// headers on the wire. OpenAI's /v1/embeddings endpoint rejects that
5166    /// with `HTTP 400 "you must provide a model parameter"` even though the
5167    /// body actually contains `model`. The fix is to drop the explicit
5168    /// `.header("Content-Type", ...)` call. This test pins that we send
5169    /// exactly one Content-Type header.
5170    #[test]
5171    fn openai_compatible_request_has_single_content_type_header() {
5172        use std::sync::{Arc, Mutex};
5173        let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
5174        let captured_for_thread = Arc::clone(&captured);
5175
5176        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
5177        let addr = listener.local_addr().expect("local addr");
5178        let handle = thread::spawn(move || {
5179            let (mut stream, _) = listener.accept().expect("accept");
5180            let mut buf = Vec::new();
5181            let mut chunk = [0u8; 4096];
5182            let mut header_end = None;
5183            let mut content_length = 0usize;
5184            loop {
5185                let n = stream.read(&mut chunk).expect("read");
5186                if n == 0 {
5187                    break;
5188                }
5189                buf.extend_from_slice(&chunk[..n]);
5190                if header_end.is_none() {
5191                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
5192                        header_end = Some(pos + 4);
5193                        for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
5194                            if let Some(value) = line.strip_prefix("Content-Length:") {
5195                                content_length = value.trim().parse::<usize>().unwrap_or(0);
5196                            }
5197                        }
5198                    }
5199                }
5200                if let Some(end) = header_end {
5201                    if buf.len() >= end + content_length {
5202                        break;
5203                    }
5204                }
5205            }
5206            *captured_for_thread.lock().unwrap() = buf;
5207            let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
5208            let response = format!(
5209                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
5210                body.len(),
5211                body
5212            );
5213            let _ = stream.write_all(response.as_bytes());
5214        });
5215
5216        let config = SemanticBackendConfig {
5217            backend: SemanticBackend::OpenAiCompatible,
5218            model: "text-embedding-3-small".to_string(),
5219            base_url: Some(format!("http://{}", addr)),
5220            api_key_env: None,
5221            timeout_ms: 5_000,
5222            max_batch_size: 64,
5223            max_files: 20_000,
5224        };
5225        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
5226        let _ = model.embed(vec!["probe".to_string()]).unwrap();
5227        handle.join().unwrap();
5228
5229        let bytes = captured.lock().unwrap().clone();
5230        let request = String::from_utf8_lossy(&bytes);
5231
5232        // Lowercase line counts because HTTP headers are case-insensitive
5233        // and reqwest may emit `content-type` in lowercase under HTTP/2.
5234        let content_type_lines = request
5235            .lines()
5236            .filter(|line| {
5237                let lower = line.to_ascii_lowercase();
5238                lower.starts_with("content-type:")
5239            })
5240            .count();
5241        assert_eq!(
5242            content_type_lines, 1,
5243            "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
5244        );
5245
5246        // The body must still include the model field — pin this so a future
5247        // change can't accidentally drop `model` while fixing duplicate headers.
5248        assert!(
5249            request.contains(r#""model":"text-embedding-3-small""#),
5250            "request body should contain model field; full request:\n{request}",
5251        );
5252    }
5253
5254    #[test]
5255    fn ollama_backend_embeds_with_mock_server() {
5256        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
5257            assert!(request_line.starts_with("POST "));
5258            assert_eq!(path, "/api/embed");
5259            "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
5260        });
5261
5262        let config = SemanticBackendConfig {
5263            backend: SemanticBackend::Ollama,
5264            model: "embeddinggemma".to_string(),
5265            base_url: Some(base_url),
5266            api_key_env: None,
5267            timeout_ms: 5_000,
5268            max_batch_size: 64,
5269            max_files: 20_000,
5270        };
5271
5272        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
5273        let vectors = model
5274            .embed(vec!["hello".to_string(), "world".to_string()])
5275            .unwrap();
5276
5277        assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
5278        handle.join().unwrap();
5279    }
5280
5281    #[test]
5282    fn read_from_disk_rejects_fingerprint_mismatch() {
5283        let storage = tempfile::tempdir().unwrap();
5284        let project_key = "proj";
5285
5286        let project_root = test_project_root();
5287        let file = project_root.join("src/main.rs");
5288        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
5289        index.entries.push(EmbeddingEntry {
5290            chunk: SemanticChunk {
5291                file: file.clone(),
5292                name: "handle_request".to_string(),
5293                qualified_name: None,
5294                kind: SymbolKind::Function,
5295                start_line: 10,
5296                end_line: 25,
5297                exported: true,
5298                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
5299                snippet: "fn handle_request() {}".to_string(),
5300            },
5301            vector: vec![0.1, 0.2, 0.3],
5302        });
5303        index.dimension = 3;
5304        index
5305            .file_mtimes
5306            .insert(file.clone(), SystemTime::UNIX_EPOCH);
5307        index.file_sizes.insert(file, 0);
5308        index.set_fingerprint(SemanticIndexFingerprint {
5309            backend: "openai_compatible".to_string(),
5310            model: "test-embedding".to_string(),
5311            base_url: "http://127.0.0.1:1234/v1".to_string(),
5312            dimension: 3,
5313            chunking_version: default_chunking_version(),
5314        });
5315        index.write_to_disk(storage.path(), project_key);
5316
5317        let matching = index.fingerprint().unwrap().as_string();
5318        assert!(SemanticIndex::read_from_disk(
5319            storage.path(),
5320            project_key,
5321            &project_root,
5322            false,
5323            Some(&matching),
5324        )
5325        .is_some());
5326
5327        let mismatched = SemanticIndexFingerprint {
5328            backend: "ollama".to_string(),
5329            model: "embeddinggemma".to_string(),
5330            base_url: "http://127.0.0.1:11434".to_string(),
5331            dimension: 3,
5332            chunking_version: default_chunking_version(),
5333        }
5334        .as_string();
5335        assert!(SemanticIndex::read_from_disk(
5336            storage.path(),
5337            project_key,
5338            &project_root,
5339            false,
5340            Some(&mismatched),
5341        )
5342        .is_none());
5343    }
5344
5345    #[test]
5346    fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
5347        let storage = tempfile::tempdir().unwrap();
5348        let project_key = "proj-v3";
5349        let dir = storage.path().join("semantic").join(project_key);
5350        fs::create_dir_all(&dir).unwrap();
5351
5352        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
5353        index.entries.push(EmbeddingEntry {
5354            chunk: SemanticChunk {
5355                file: PathBuf::from("/src/main.rs"),
5356                name: "handle_request".to_string(),
5357                qualified_name: None,
5358                kind: SymbolKind::Function,
5359                start_line: 0,
5360                end_line: 0,
5361                exported: true,
5362                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
5363                snippet: "fn handle_request() {}".to_string(),
5364            },
5365            vector: vec![0.1, 0.2, 0.3],
5366        });
5367        index.dimension = 3;
5368        index
5369            .file_mtimes
5370            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
5371        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
5372        let fingerprint = SemanticIndexFingerprint {
5373            backend: "fastembed".to_string(),
5374            model: "test".to_string(),
5375            base_url: FALLBACK_BACKEND.to_string(),
5376            dimension: 3,
5377            chunking_version: default_chunking_version(),
5378        };
5379        index.set_fingerprint(fingerprint.clone());
5380
5381        let mut bytes = index.to_bytes();
5382        bytes[0] = SEMANTIC_INDEX_VERSION_V3;
5383        fs::write(dir.join("semantic.bin"), bytes).unwrap();
5384
5385        assert!(SemanticIndex::read_from_disk(
5386            storage.path(),
5387            project_key,
5388            &test_project_root(),
5389            false,
5390            Some(&fingerprint.as_string())
5391        )
5392        .is_none());
5393        assert!(!dir.join("semantic.bin").exists());
5394    }
5395
5396    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
5397        crate::symbols::Symbol {
5398            name: name.to_string(),
5399            kind,
5400            range: crate::symbols::Range {
5401                start_line: start,
5402                start_col: 0,
5403                end_line: end,
5404                end_col: 0,
5405            },
5406            signature: None,
5407            scope_chain: Vec::new(),
5408            exported: false,
5409            parent: None,
5410        }
5411    }
5412
/// A symbol nested under a scope gets a dotted `qualified_name` on its chunk,
/// while the chunk's `embed_text` stays exactly what `build_embed_text`
/// produces for the bare symbol (no qualified name leaks into it).
#[test]
fn symbols_to_chunks_sets_qualified_name_without_changing_embed_text() {
    let root = PathBuf::from("/proj");
    let path = root.join("src/engine.ts");
    let src = "class Index {\n}\n";

    // Class `Index` living inside an `Engine` scope.
    let symbol = {
        let mut sym = make_symbol(SymbolKind::Class, "Index", 0, 1);
        sym.signature = Some("class Index".to_string());
        sym.scope_chain = vec!["Engine".to_string()];
        sym
    };

    // Captured up front: the symbol is moved into the slice passed below.
    let expected_embed_text = build_embed_text(&symbol, src, &path, &root);

    let chunks = symbols_to_chunks(&path, &[symbol], src, &root);
    let class_chunk = chunks
        .iter()
        .find(|candidate| candidate.name == "Index")
        .expect("class chunk");

    assert_eq!(class_chunk.name, "Index");
    assert_eq!(class_chunk.qualified_name.as_deref(), Some("Engine.Index"));
    assert_eq!(class_chunk.embed_text, expected_embed_text);
    assert!(!class_chunk.embed_text.contains("Engine.Index"));
}
5434
/// Markdown / HTML heading symbols are deliberately kept out of the index.
/// Heading prose tends to out-rank code chunks in semantic results even for
/// code-shaped queries, so a file whose only symbols are headings must
/// yield no chunks at all.
#[test]
fn symbols_to_chunks_skips_heading_symbols() {
    let root = PathBuf::from("/proj");
    let readme = root.join("README.md");
    let markdown = "# Title\n\nbody text\n\n## Section\n\nmore text\n";

    // Nothing but headings goes in ...
    let headings = [
        make_symbol(SymbolKind::Heading, "Title", 0, 2),
        make_symbol(SymbolKind::Heading, "Section", 4, 6),
    ];

    // ... so nothing may come out.
    let chunks = symbols_to_chunks(&readme, &headings, markdown, &root);
    let produced = chunks.len();
    assert!(
        chunks.is_empty(),
        "Heading symbols must be filtered out before embedding; got {} chunk(s)",
        produced
    );
}
5457
/// Guards the embed-text length clamp. A symbol can carry a huge signature
/// (e.g. a YAML/Kubernetes CronJob whose inline `command:` script ends up in
/// the signature); the resulting `embed_text` must still fit within
/// `MAX_EMBED_TEXT_CHARS` so one oversized input cannot overflow the
/// embedding backend's batch and take the whole index build down with it.
#[test]
fn build_embed_text_clamps_oversized_signature() {
    let root = PathBuf::from("/proj");
    let manifest = root.join("cronjob.yaml");
    let manifest_src = "apiVersion: batch/v1\nkind: CronJob\n";

    let mut cronjob = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
    // 2000 repetitions of an 8-byte token: roughly 16 KB of signature.
    cronjob.signature = Some("kubectl ".repeat(2000));

    let embed_text = build_embed_text(&cronjob, manifest_src, &manifest, &root);
    // Measured in chars (not bytes), matching the unit of the limit.
    let char_count = embed_text.chars().count();
    assert!(
        char_count <= MAX_EMBED_TEXT_CHARS,
        "embed_text must be clamped to {} chars, got {}",
        MAX_EMBED_TEXT_CHARS,
        char_count
    );
}
5482
/// Counterpart to the heading-skip test: when headings and code symbols are
/// mixed in one file, only the headings are dropped. The function and struct
/// must still be chunked (plus the file-summary chunk), otherwise the index
/// would be starved of code.
#[test]
fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
    let root = PathBuf::from("/proj");
    let lib_rs = root.join("src/lib.rs");
    let src = "pub fn handle_request() -> bool {\n    true\n}\n";

    // One heading (e.g. from a doc comment block elsewhere) among two code symbols.
    let mixed = [
        make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
        make_symbol(SymbolKind::Function, "handle_request", 0, 2),
        make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
    ];

    let chunks = symbols_to_chunks(&lib_rs, &mixed, src, &root);

    let total = chunks.len();
    assert_eq!(
        total, 3,
        "Expected file-summary + 2 code chunks (Function + Struct), got {}",
        total
    );

    let has_file_summary = chunks
        .iter()
        .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary));
    assert!(has_file_summary);

    let names: Vec<&str> = chunks.iter().map(|chunk| chunk.name.as_str()).collect();
    assert!(names.contains(&"handle_request"));
    assert!(names.contains(&"AuthService"));
    assert!(
        !names.contains(&"doc heading"),
        "Heading symbol leaked into chunks: {names:?}"
    );
}
5517
/// Loopback hostnames must pass the SSRF guard so that a self-hosted backend
/// on its default address (e.g. Ollama at http://localhost:11434) works
/// without extra configuration.
#[test]
fn validate_ssrf_allows_loopback_hostnames() {
    for host in &[
        "http://localhost",
        "http://localhost:8080",
        "http://localhost:11434", // Ollama default
        "http://localhost.localdomain",
        "http://foo.localhost",
    ] {
        // Validate once and reuse the outcome in the failure message, so the
        // diagnostic always shows the very result that failed the check
        // (same shape as the sibling SSRF tests).
        let result = validate_base_url_no_ssrf(host);
        assert!(
            result.is_ok(),
            "Expected {host} to be allowed (loopback), got: {:?}",
            result
        );
    }
}
5536
/// Every address in 127.0.0.0/8 is loopback (same machine), so the SSRF
/// guard must let it through — this is what makes a local Ollama endpoint at
/// http://127.0.0.1:11434 usable.
#[test]
fn validate_ssrf_allows_loopback_ips() {
    let loopback_urls = [
        "http://127.0.0.1",
        "http://127.0.0.1:11434", // Ollama default
        "http://127.0.0.1:8080",
        "http://127.1.2.3",
    ];

    for url in loopback_urls.iter() {
        let outcome = validate_base_url_no_ssrf(url);
        assert!(
            outcome.is_ok(),
            "Expected {url} to be allowed (loopback), got: {:?}",
            outcome
        );
    }
}
5555
/// Private / reserved addresses that are NOT loopback stay rejected:
/// homelab and intranet services on LAN IPs are genuine SSRF targets, even
/// when the user configured the URL themselves. The cases cover the RFC 1918
/// blocks, link-local (incl. the 169.254.169.254 metadata address) and CGNAT.
#[test]
fn validate_ssrf_rejects_private_non_loopback_ips() {
    let private_urls = [
        "http://192.168.1.1",
        "http://10.0.0.1",
        "http://172.16.0.1",
        "http://169.254.169.254",
        "http://100.64.0.1",
    ];

    private_urls.iter().for_each(|url| {
        let outcome = validate_base_url_no_ssrf(url);
        assert!(
            outcome.is_err(),
            "Expected {url} to be rejected (non-loopback private), got: {:?}",
            outcome
        );
    });
}
5577
/// mDNS `.local` hostnames usually name LAN devices rather than the local
/// machine, so the SSRF guard is expected to refuse them outright.
#[test]
fn validate_ssrf_rejects_mdns_local_hostnames() {
    let mdns_hosts = [
        "http://printer.local",
        "http://nas.local:8080",
        "http://homelab.local",
    ];

    for host in mdns_hosts.iter() {
        let verdict = validate_base_url_no_ssrf(host);
        assert!(
            verdict.is_err(),
            "Expected {host} to be rejected (mDNS), got: {:?}",
            verdict
        );
    }
}
5595
/// `normalize_base_url` on its own must accept loopback URLs; SSRF policy is
/// the job of `validate_base_url_no_ssrf`. Tests that construct backends
/// directly depend on this split.
#[test]
fn normalize_base_url_allows_localhost_for_tests() {
    let by_ip = normalize_base_url("http://127.0.0.1:9999");
    assert!(by_ip.is_ok());

    let by_name = normalize_base_url("http://localhost:8080");
    assert!(by_name.is_ok());
}
5603
/// Exercises `is_private_non_loopback_ip` directly on parsed addresses:
/// private and reserved ranges must be flagged, loopback (in every spelling)
/// and public addresses must not.
#[test]
fn ssrf_guard_blocks_reserved_ranges_but_allows_loopback() {
    use std::net::IpAddr;

    // Parse a textual address and ask the guard whether it is blocked.
    let is_blocked = |text: &str| {
        let ip: IpAddr = text.parse().unwrap();
        is_private_non_loopback_ip(&ip)
    };

    // Classic private / link-local / CGNAT ranges.
    assert!(is_blocked("10.0.0.1"));
    assert!(is_blocked("192.168.1.1"));
    assert!(is_blocked("169.254.0.1"));
    assert!(is_blocked("100.64.0.1"));

    // Further reserved ranges: benchmarking, multicast, IPv6 ULA / link-local.
    assert!(
        is_blocked("198.18.0.1"),
        "RFC2544 benchmark range must be blocked"
    );
    assert!(is_blocked("224.0.0.1"), "multicast must be blocked");
    assert!(is_blocked("fc00::1"), "IPv6 ULA must be blocked");
    assert!(is_blocked("fe80::1"), "IPv6 link-local must be blocked");

    // Loopback is carved out (local Ollama endpoint), including the
    // IPv4-mapped IPv6 spelling.
    assert!(!is_blocked("127.0.0.1"), "loopback must stay allowed");
    assert!(!is_blocked("::1"), "IPv6 loopback must stay allowed");
    assert!(
        !is_blocked("::ffff:127.0.0.1"),
        "IPv4-mapped loopback must stay allowed (matches prior carve-out)"
    );

    // Sanity check: an ordinary public address is not flagged.
    assert!(!is_blocked("8.8.8.8"));
}
5634
/// Pins the user-facing wording of the ONNX Runtime version-mismatch error.
///
/// The message must echo the detected version and library path, state the
/// minimum supported version, and present the auto-fix remedy before the
/// manual "remove the old library" remedy — auto-fix is the option that needs
/// no sudo and cannot break other applications linking the system library.
/// It must also contain the exact `doctor --fix` command so users can
/// copy-paste it.
#[test]
fn ort_mismatch_message_recommends_auto_fix_first() {
    let system_lib = "/usr/lib/x86_64-linux-gnu/libonnxruntime.so";
    let msg = format_ort_version_mismatch("1.9.0", system_lib);

    // Detected version, library path and requirement all appear verbatim.
    assert!(
        msg.contains("v1.9.0"),
        "should report detected version: {msg}"
    );
    assert!(
        msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
        "should report system path: {msg}"
    );
    assert!(msg.contains("v1.20+"), "should state requirement: {msg}");

    // Ordering: only the relative position of the first two remedies is
    // checked here — auto-fix has to precede the manual removal.
    let auto_fix_at = msg
        .find("Auto-fix")
        .expect("Auto-fix solution missing — users won't discover --fix");
    let manual_rm_at = msg
        .find("Remove the old library")
        .expect("system-rm solution missing");
    assert!(
        auto_fix_at < manual_rm_at,
        "Auto-fix must come before manual rm — see PR comment thread"
    );

    // The command has to be present in full, ready to paste into a shell.
    assert!(
        msg.contains("npx @cortexkit/aft doctor --fix"),
        "auto-fix command must be present and copy-pasteable: {msg}"
    );
}
5675
/// When the requested library name carries no version (bare
/// `libonnxruntime.so`) but the resolved, actually-loaded path does, version
/// detection must use the resolved path — and report that path as the source
/// so the mismatch message points at the real file.
#[cfg(any(target_os = "linux", target_os = "macos"))]
#[test]
fn loaded_ort_version_detection_prefers_actual_loaded_library_path() {
    let requested = "libonnxruntime.so";
    let actual = "/usr/local/lib/libonnxruntime.so.1.19.0";

    // The bare requested name alone yields no version.
    assert_eq!(detect_ort_version_from_path(requested), None);

    let resolved = Some(actual.to_string());
    let (version, source) = detect_ort_version_from_resolved_or_requested(resolved, requested);

    assert_eq!(version.as_deref(), Some("1.19.0"));
    assert_eq!(source, actual);

    // The detected pair flows straight into the user-facing message.
    let detected = version.unwrap();
    let msg = format_ort_version_mismatch(&detected, &source);
    assert!(msg.contains("v1.19.0"));
    assert!(msg.contains(actual));
}
5693
/// Regression guard for the mismatch message with a macOS `.dylib` path:
/// the version and the path must both be rendered, and the path must appear
/// wrapped in single quotes at least once.
#[test]
fn ort_mismatch_message_handles_macos_dylib_path() {
    let dylib = "/opt/homebrew/lib/libonnxruntime.dylib";
    let msg = format_ort_version_mismatch("1.9.0", dylib);

    assert!(msg.contains("v1.9.0"));
    assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));

    // Only the single-quoted occurrence (expected in the auto-fix sentence)
    // is pinned explicitly; any other placement of the path is covered just
    // by the plain `contains` check above.
    let quoted_in_message = msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'");
    assert!(
        quoted_in_message,
        "system path should be quoted in the auto-fix sentence: {msg}"
    );
}
5710}