Skip to main content

aft/
semantic_index.rs

1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
const DEFAULT_DIMENSION: usize = 384;
const MAX_ENTRIES: usize = 1_000_000;
/// Upper bound on accepted embedding widths. Sized to cover high-dimensional
/// backends such as OpenAI text-embedding-3-large (3072) and common local
/// models (4096) while still keeping the supported shape bounded.
const MAX_DIMENSION: usize = 4096;
const F32_BYTES: usize = std::mem::size_of::<f32>();
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// Remediation text shown when the ONNX Runtime library cannot be located.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

/// V1 and V2 persisted whole-second file mtimes only (see V3 for why that
/// was a problem).
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
/// V3 stores `subsec_nanos` in the file-mtime table. Without it, staleness
/// detection did not survive a restart on filesystems with subsecond mtime
/// precision (APFS, ext4 with nsec, NTFS): roughly 99% of files were flagged
/// stale and re-embedded every time.
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
/// V4 has the same on-disk layout as V3. The bump forces a one-time rebuild of
/// persisted snippets after symbol ranges that were wrongly treated as
/// 1-based were fixed.
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
/// V5 records file sizes in the file metadata table so incremental staleness
/// detection also notices content changes that mtime precision misses.
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
/// V6 stores paths relative to `project_root` and adds content hashes.
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended (after `/v1`) to an OpenAI-compatible base URL.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended to an Ollama base URL.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP timeout used when the config leaves `timeout_ms` at 0. It has to stay
/// below the bridge timeout (30s), otherwise slow backends get the bridge killed.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the config leaves `max_batch_size` at 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Maximum number of query embeddings kept in the per-model query cache.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder recorded as the fingerprint `base_url` when the config has no
/// usable base URL.
const FALLBACK_BACKEND: &str = "none";
/// Total number of tries for one embedding HTTP request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay before each retry; one entry per gap between consecutive attempts.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; EMBEDDING_REQUEST_MAX_ATTEMPTS - 1] = [500, 1_000];
/// Process-wide mutex held while acquiring the on-disk semantic cache lock.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
62
63pub struct SemanticIndexLock {
64    _guard: fs_lock::LockGuard,
65}
66
67impl SemanticIndexLock {
68    pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
69        let dir = storage_dir.join("semantic").join(project_key);
70        fs::create_dir_all(&dir)?;
71        let path = dir.join("cache.lock");
72        let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
73            .lock()
74            .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
75        fs_lock::try_acquire(&path, Duration::from_secs(2))
76            .map(|guard| Self { _guard: guard })
77            .map_err(|error| match error {
78                fs_lock::AcquireError::Timeout => {
79                    std::io::Error::other("timed out acquiring semantic cache lock")
80                }
81                fs_lock::AcquireError::Io(error) => error,
82            })
83    }
84}
85
86#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct SemanticIndexFingerprint {
88    pub backend: String,
89    pub model: String,
90    #[serde(default)]
91    pub base_url: String,
92    pub dimension: usize,
93    #[serde(default = "default_chunking_version")]
94    pub chunking_version: u32,
95}
96
/// Chunking version stamped on new fingerprints and assumed for serialized
/// fingerprints that do not carry the field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
100
101impl SemanticIndexFingerprint {
102    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
103        // Use normalized URL for fingerprinting so cosmetic differences
104        // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds.
105        let base_url = config
106            .base_url
107            .as_ref()
108            .and_then(|u| normalize_base_url(u).ok())
109            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
110        Self {
111            backend: config.backend.as_str().to_string(),
112            model: config.model.clone(),
113            base_url,
114            dimension,
115            chunking_version: default_chunking_version(),
116        }
117    }
118
119    pub fn as_string(&self) -> String {
120        serde_json::to_string(self).unwrap_or_else(|_| String::new())
121    }
122
123    fn matches_expected(&self, expected: &str) -> bool {
124        let encoded = self.as_string();
125        !encoded.is_empty() && encoded == expected
126    }
127}
128
129enum SemanticEmbeddingEngine {
130    /// Local ONNX embedder (all-MiniLM-L6-v2 via raw `ort`). The config-facing
131    /// backend string stays "fastembed" for index-fingerprint compatibility.
132    Local(LocalEmbedder),
133    OpenAiCompatible {
134        client: Client,
135        model: String,
136        base_url: String,
137        api_key: Option<String>,
138    },
139    Ollama {
140        client: Client,
141        model: String,
142        base_url: String,
143    },
144}
145
146pub struct SemanticEmbeddingModel {
147    backend: SemanticBackend,
148    model: String,
149    base_url: Option<String>,
150    timeout_ms: u64,
151    max_batch_size: usize,
152    dimension: Option<usize>,
153    engine: SemanticEmbeddingEngine,
154    query_embedding_cache: HashMap<String, Vec<f32>>,
155    query_embedding_cache_order: VecDeque<String>,
156    query_embedding_cache_hits: u64,
157    query_embedding_cache_misses: u64,
158}
159
160pub type EmbeddingModel = SemanticEmbeddingModel;
161
162fn validate_embedding_batch(
163    vectors: &[Vec<f32>],
164    expected_count: usize,
165    context: &str,
166) -> Result<(), String> {
167    if expected_count > 0 && vectors.is_empty() {
168        return Err(format!(
169            "{context} returned no vectors for {expected_count} inputs"
170        ));
171    }
172
173    if vectors.len() != expected_count {
174        return Err(format!(
175            "{context} returned {} vectors for {} inputs",
176            vectors.len(),
177            expected_count
178        ));
179    }
180
181    let Some(first_vector) = vectors.first() else {
182        return Ok(());
183    };
184    let expected_dimension = first_vector.len();
185    validate_embedding_dimension(expected_dimension)
186        .map_err(|error| format!("{context} returned {error}"))?;
187    for (index, vector) in vectors.iter().enumerate() {
188        if vector.len() != expected_dimension {
189            return Err(format!(
190                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
191                vector.len()
192            ));
193        }
194    }
195
196    Ok(())
197}
198
199fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
200    if dimension == 0 || dimension > MAX_DIMENSION {
201        return Err(format!(
202            "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
203        ));
204    }
205
206    Ok(())
207}
208
209/// Normalize a base URL: validate scheme and strip trailing slash.
210/// Does NOT perform SSRF/private-IP validation — call
211/// `validate_base_url_no_ssrf` separately when processing user-supplied config.
212fn normalize_base_url(raw: &str) -> Result<String, String> {
213    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
214    let scheme = parsed.scheme();
215    if scheme != "http" && scheme != "https" {
216        return Err(format!(
217            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
218            scheme
219        ));
220    }
221    Ok(parsed.to_string().trim_end_matches('/').to_string())
222}
223
224/// Validate that a base URL does not point to a private/loopback address.
225/// Call this on user-supplied config (at configure time) to prevent SSRF.
226/// Not called for programmatically constructed configs (e.g. tests).
227///
228/// **Loopback is allowed.** Self-hosted embedding backends (e.g. Ollama at
229/// `http://127.0.0.1:11434`) are a primary use case for `aft_search`. Loopback
230/// addresses by definition cannot be exploited as SSRF targets — they only
231/// reach services on the same machine. Allowing loopback unblocks Ollama at its
232/// default config without opening up SSRF to LAN/intranet services, which
233/// remain rejected.
234///
235/// **mDNS `.local` is rejected.** mDNS hostnames typically resolve to LAN
236/// devices (printers, homelab servers); rejecting them before DNS lookup keeps
237/// the SSRF guard meaningful for non-loopback private networks.
238pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
239    use std::net::{IpAddr, ToSocketAddrs};
240
241    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
242
243    let host = parsed.host_str().unwrap_or("");
244
245    // Loopback hostnames are explicitly allowed. RFC 6761 mandates that
246    // `localhost` and `*.localhost` resolve to loopback;
247    // `localhost.localdomain` is a historical alias used on some Linux
248    // distros. Self-hosted backends like Ollama use these by default.
249    let is_loopback_host =
250        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
251    if is_loopback_host {
252        return Ok(());
253    }
254
255    // mDNS hostnames are typically LAN devices, not loopback. Reject before
256    // DNS lookup so users get a clear error rather than a private-IP error.
257    if host.ends_with(".local") {
258        return Err(format!(
259            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
260        ));
261    }
262
263    // Resolve the hostname. Reject private/link-local/CGNAT IPs but NOT
264    // loopback (which is by definition same-machine and not an SSRF target).
265    let port = parsed.port_or_known_default().unwrap_or(443);
266    let addr_str = format!("{host}:{port}");
267    let addrs: Vec<IpAddr> = addr_str
268        .to_socket_addrs()
269        .map(|iter| iter.map(|sa| sa.ip()).collect())
270        .unwrap_or_default();
271    for ip in &addrs {
272        if is_private_non_loopback_ip(ip) {
273            return Err(format!(
274                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
275            ));
276        }
277    }
278
279    Ok(())
280}
281
/// Returns true for IPv4/IPv6 addresses in private/link-local/CGNAT/wildcard
/// ranges, EXCLUDING loopback (127.0.0.0/8 and ::1). Loopback is considered
/// safe for SSRF purposes — see [`validate_base_url_no_ssrf`] for rationale.
///
/// Ranges treated as private:
/// - IPv4: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, 169.254.0.0/16
///   (link-local), 100.64.0.0/10 (CGNAT) and 0.0.0.0/8 (wildcard).
/// - IPv6: `::` (unspecified/wildcard), fe80::/10 (link-local), fc00::/7
///   (unique-local), and IPv4-mapped addresses (`::ffff:a.b.c.d`) whose
///   embedded IPv4 address is itself private by the rules above.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => {
            let [first, second, ..] = v4.octets();
            // Note: 127.0.0.0/8 (loopback) is intentionally NOT in this set.
            // 10.0.0.0/8
            first == 10
                // 172.16.0.0/12
                || (first == 172 && (16..=31).contains(&second))
                // 192.168.0.0/16
                || (first == 192 && second == 168)
                // 169.254.0.0/16 link-local
                || (first == 169 && second == 254)
                // 100.64.0.0/10 CGNAT
                || (first == 100 && (64..=127).contains(&second))
                // 0.0.0.0/8 wildcard
                || first == 0
        }
        IpAddr::V6(v6) => {
            let leading = v6.segments()[0];
            // Note: ::1 (loopback) is intentionally NOT in this set.
            // `::` is the IPv6 wildcard, the counterpart of 0.0.0.0 above.
            v6.is_unspecified()
                // fe80::/10 link-local
                || (leading & 0xffc0) == 0xfe80
                // fc00::/7 unique-local
                || (leading & 0xfe00) == 0xfc00
                // ::ffff:0:0/96 IPv4-mapped: classify the embedded IPv4 address
                || v6
                    .to_ipv4_mapped()
                    .is_some_and(|mapped| is_private_non_loopback_ip(&IpAddr::V4(mapped)))
        }
    }
}
323
324fn build_openai_embeddings_endpoint(base_url: &str) -> String {
325    if base_url.ends_with("/v1") {
326        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
327    } else {
328        format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
329    }
330}
331
332fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
333    if base_url.ends_with("/api") {
334        format!("{base_url}/embed")
335    } else {
336        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
337    }
338}
339
/// Trims surrounding whitespace from an optional value and maps a missing,
/// empty, or whitespace-only value to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    value
        .as_deref()
        .map(str::trim)
        .filter(|trimmed| !trimmed.is_empty())
        .map(str::to_string)
}
350
351fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
352    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
353}
354
355/// Local backends (LM Studio, Ollama, llama.cpp) can return a 4xx — usually
356/// 400/409 — while a model is loading or was just unloaded. Only narrowly known
357/// local-backend loading/unloaded payloads are classified transient; generic
358/// 4xx bodies that merely mention phrases like "loading model" remain
359/// permanent so misconfigurations do not retry forever.
360fn embedding_response_body_is_transient(status: reqwest::StatusCode, raw: &str) -> bool {
361    if !matches!(
362        status,
363        reqwest::StatusCode::BAD_REQUEST
364            | reqwest::StatusCode::CONFLICT
365            | reqwest::StatusCode::REQUEST_TIMEOUT
366            | reqwest::StatusCode::LOCKED
367            | reqwest::StatusCode::TOO_EARLY
368    ) {
369        return false;
370    }
371
372    let lower = raw.to_ascii_lowercase();
373    let normalized = lower.trim();
374
375    normalized.contains("model was unloaded while the request was still in queue")
376        || normalized == "model is loading"
377        || normalized.starts_with("model is loading,")
378        || normalized.contains(r#""error":"model is loading"#)
379        || normalized.contains(r#""message":"model is loading"#)
380        || normalized == "model not loaded"
381        || normalized.contains(r#""error":"model not loaded""#)
382        || normalized.contains(r#""message":"model not loaded""#)
383        || normalized == "loading model into memory"
384        || normalized.contains(r#""error":"loading model into memory""#)
385        || normalized.contains(r#""message":"loading model into memory""#)
386        || normalized == "model is being loaded"
387        || normalized.contains(r#""error":"model is being loaded""#)
388        || normalized.contains(r#""message":"model is being loaded""#)
389        || normalized == "model is currently loading"
390        || normalized.contains(r#""error":"model is currently loading""#)
391        || normalized.contains(r#""message":"model is currently loading""#)
392}
393
394fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
395    error.is_connect()
396}
397
398/// Whether a send-time error means the backend is *unreachable or temporarily
399/// failing* (vs. a real misconfiguration). Broader than the in-request retry
400/// predicate: a per-request timeout is transient for the build/refresh layer
401/// (the model may still be cold-loading) but we don't burn the 3 fast
402/// in-request attempts on it — the build-level retry rides it out instead.
403fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
404    error.is_connect() || error.is_timeout()
405}
406
407fn embedding_response_read_error_is_transient(error: &reqwest::Error) -> bool {
408    embedding_send_error_is_transient(error) || error.is_body() || error.is_decode()
409}
410
/// Machine-readable prefix put on embedding error strings whose root cause is
/// transient: the backend is down, timing out, or answering 5xx/429 rather
/// than being misconfigured.
///
/// The build and corpus-refresh layers decide retry-vs-give-up from this
/// marker (see [`embedding_failure_is_transient`]) instead of re-parsing error
/// text, so the verdict is made once, at the site that actually knows it.
/// Remove it with [`strip_transient_embedding_marker`] before showing the
/// message to a user.
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";
418
419/// True when an embedding error carries the transient marker — i.e. retrying
420/// once the backend recovers is the right move, not surfacing a hard failure.
421pub fn embedding_failure_is_transient(error: &str) -> bool {
422    error.contains(TRANSIENT_EMBEDDING_MARKER)
423}
424
425/// Remove the machine transient marker so the message is clean for display.
426pub fn strip_transient_embedding_marker(error: &str) -> String {
427    error.replace(TRANSIENT_EMBEDDING_MARKER, "")
428}
429
430fn sleep_before_embedding_retry(attempt_index: usize) {
431    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
432        std::thread::sleep(Duration::from_millis(*delay_ms));
433    }
434}
435
436fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
437where
438    F: FnMut() -> reqwest::blocking::RequestBuilder,
439{
440    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
441        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
442
443        let response = match make_request().send() {
444            Ok(response) => response,
445            Err(error) => {
446                if !last_attempt && is_retryable_embedding_error(&error) {
447                    sleep_before_embedding_retry(attempt_index);
448                    continue;
449                }
450                // Connect/timeout failures mean the backend is unreachable or
451                // cold-loading — mark transient so the build layer rides it out
452                // and self-heals instead of parking the index in `Failed`.
453                let marker = if embedding_send_error_is_transient(&error) {
454                    TRANSIENT_EMBEDDING_MARKER
455                } else {
456                    ""
457                };
458                return Err(format!("{marker}{backend_label} request failed: {error}"));
459            }
460        };
461
462        let status = response.status();
463        let raw = match response.text() {
464            Ok(raw) => raw,
465            Err(error) => {
466                if !last_attempt && embedding_response_read_error_is_transient(&error) {
467                    sleep_before_embedding_retry(attempt_index);
468                    continue;
469                }
470                let marker = if embedding_response_read_error_is_transient(&error) {
471                    TRANSIENT_EMBEDDING_MARKER
472                } else {
473                    ""
474                };
475                return Err(format!(
476                    "{marker}{backend_label} response read failed: {error}"
477                ));
478            }
479        };
480
481        if status.is_success() {
482            return Ok(raw);
483        }
484
485        // A 4xx whose body says the model is loading/unloaded is transient on
486        // local backends (LM Studio/Ollama), so treat it like a retryable
487        // status: ride it out at both the in-request and build-retry layers.
488        let body_transient = embedding_response_body_is_transient(status, &raw);
489        if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
490            sleep_before_embedding_retry(attempt_index);
491            continue;
492        }
493
494        // 5xx / 429 are server-side and transient — the backend is overloaded
495        // or briefly unavailable, not misconfigured. A 4xx whose body indicates
496        // the model is (un)loading is also transient (local backend mid-swap).
497        // Other 4xx (auth, bad request, model-not-found) is a real error the
498        // user must fix; no marker.
499        let marker = if is_retryable_embedding_status(status) || body_transient {
500            TRANSIENT_EMBEDDING_MARKER
501        } else {
502            ""
503        };
504        return Err(format!(
505            "{marker}{backend_label} request failed (HTTP {}): {}",
506            status, raw
507        ));
508    }
509
510    unreachable!("embedding request retries exhausted without returning")
511}
512
513impl SemanticEmbeddingModel {
514    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
515        let timeout_ms = if config.timeout_ms == 0 {
516            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
517        } else {
518            config.timeout_ms
519        };
520
521        let max_batch_size = if config.max_batch_size == 0 {
522            DEFAULT_MAX_BATCH_SIZE
523        } else {
524            config.max_batch_size
525        };
526
527        let api_key_env = normalize_api_key(config.api_key_env.clone());
528        let model = config.model.clone();
529
530        let client = Client::builder()
531            .timeout(Duration::from_millis(timeout_ms))
532            .redirect(reqwest::redirect::Policy::none())
533            .build()
534            .map_err(|error| format!("failed to configure embedding client: {error}"))?;
535
536        let engine = match config.backend {
537            SemanticBackend::Fastembed => {
538                SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
539            }
540            SemanticBackend::OpenAiCompatible => {
541                let raw = config.base_url.as_ref().ok_or_else(|| {
542                    "base_url is required for openai_compatible backend".to_string()
543                })?;
544                let base_url = normalize_base_url(raw)?;
545
546                let api_key = match api_key_env {
547                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
548                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
549                    })?),
550                    None => None,
551                };
552
553                SemanticEmbeddingEngine::OpenAiCompatible {
554                    client,
555                    model,
556                    base_url,
557                    api_key,
558                }
559            }
560            SemanticBackend::Ollama => {
561                let raw = config
562                    .base_url
563                    .as_ref()
564                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
565                let base_url = normalize_base_url(raw)?;
566
567                SemanticEmbeddingEngine::Ollama {
568                    client,
569                    model,
570                    base_url,
571                }
572            }
573        };
574
575        Ok(Self {
576            backend: config.backend,
577            model: config.model.clone(),
578            base_url: config.base_url.clone(),
579            timeout_ms,
580            max_batch_size,
581            dimension: None,
582            engine,
583            query_embedding_cache: HashMap::new(),
584            query_embedding_cache_order: VecDeque::new(),
585            query_embedding_cache_hits: 0,
586            query_embedding_cache_misses: 0,
587        })
588    }
589
590    pub fn backend(&self) -> SemanticBackend {
591        self.backend
592    }
593
594    pub fn model(&self) -> &str {
595        &self.model
596    }
597
598    pub fn base_url(&self) -> Option<&str> {
599        self.base_url.as_deref()
600    }
601
602    pub fn max_batch_size(&self) -> usize {
603        self.max_batch_size
604    }
605
606    pub fn timeout_ms(&self) -> u64 {
607        self.timeout_ms
608    }
609
610    pub fn fingerprint(
611        &mut self,
612        config: &SemanticBackendConfig,
613    ) -> Result<SemanticIndexFingerprint, String> {
614        let dimension = self.dimension()?;
615        Ok(SemanticIndexFingerprint::from_config(config, dimension))
616    }
617
618    pub fn dimension(&mut self) -> Result<usize, String> {
619        if let Some(dimension) = self.dimension {
620            return Ok(dimension);
621        }
622
623        let dimension = match &mut self.engine {
624            SemanticEmbeddingEngine::Local(model) => {
625                let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
626                vectors
627                    .first()
628                    .map(|v| v.len())
629                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
630            }
631            SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
632                let vectors =
633                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
634                vectors
635                    .first()
636                    .map(|v| v.len())
637                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
638            }
639            SemanticEmbeddingEngine::Ollama { .. } => {
640                let vectors =
641                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
642                vectors
643                    .first()
644                    .map(|v| v.len())
645                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
646            }
647        };
648
649        self.dimension = Some(dimension);
650        Ok(dimension)
651    }
652
653    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
654        self.embed_texts(texts)
655    }
656
657    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
658        if let Some(vector) = self.query_embedding_cache.get(query) {
659            self.query_embedding_cache_hits += 1;
660            return Ok(vector.clone());
661        }
662
663        self.query_embedding_cache_misses += 1;
664        let embeddings = self.embed_texts(vec![query.to_string()])?;
665        let vector = embeddings
666            .first()
667            .cloned()
668            .ok_or_else(|| "embedding model returned no query vector".to_string())?;
669
670        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
671            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
672                self.query_embedding_cache.remove(&oldest);
673            }
674        }
675        self.query_embedding_cache
676            .insert(query.to_string(), vector.clone());
677        self.query_embedding_cache_order
678            .push_back(query.to_string());
679
680        Ok(vector)
681    }
682
683    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
684        (
685            self.query_embedding_cache_hits,
686            self.query_embedding_cache_misses,
687            self.query_embedding_cache.len(),
688        )
689    }
690
691    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
692        match &mut self.engine {
693            SemanticEmbeddingEngine::Local(model) => model
694                .embed(&texts)
695                .map_err(|error| format!("failed to embed batch: {error}")),
696            SemanticEmbeddingEngine::OpenAiCompatible {
697                client,
698                model,
699                base_url,
700                api_key,
701            } => {
702                let expected_text_count = texts.len();
703                let endpoint = build_openai_embeddings_endpoint(base_url);
704                let body = serde_json::json!({
705                    "input": texts,
706                    "model": model,
707                });
708
709                let raw = send_embedding_request(
710                    || {
711                        // `.json(&body)` sets Content-Type: application/json
712                        // automatically. Do NOT add `.header("Content-Type",
713                        // "application/json")` afterwards — RequestBuilder::header()
714                        // calls HeaderMap::append, which produces TWO Content-Type
715                        // headers on the wire. OpenAI's /v1/embeddings endpoint
716                        // treats duplicate Content-Type as malformed and rejects
717                        // the body with 400 "you must provide a model parameter"
718                        // even when `model` is set. Verified end-to-end against
719                        // api.openai.com. See issue #36.
720                        let mut request = client.post(&endpoint).json(&body);
721
722                        if let Some(api_key) = api_key {
723                            request = request.header("Authorization", format!("Bearer {api_key}"));
724                        }
725
726                        request
727                    },
728                    "openai compatible",
729                )?;
730
731                #[derive(Deserialize)]
732                struct OpenAiResponse {
733                    data: Vec<OpenAiEmbeddingResult>,
734                }
735
736                #[derive(Deserialize)]
737                struct OpenAiEmbeddingResult {
738                    embedding: Vec<f32>,
739                    index: Option<u32>,
740                }
741
742                let parsed: OpenAiResponse = serde_json::from_str(&raw)
743                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
744                if parsed.data.len() != expected_text_count {
745                    return Err(format!(
746                        "openai compatible response returned {} embeddings for {} inputs",
747                        parsed.data.len(),
748                        expected_text_count
749                    ));
750                }
751
752                let mut vectors = vec![Vec::new(); parsed.data.len()];
753                for (i, item) in parsed.data.into_iter().enumerate() {
754                    let index = item.index.unwrap_or(i as u32) as usize;
755                    if index >= vectors.len() {
756                        return Err(
757                            "openai compatible response contains invalid vector index".to_string()
758                        );
759                    }
760                    vectors[index] = item.embedding;
761                }
762
763                for vector in &vectors {
764                    if vector.is_empty() {
765                        return Err(
766                            "openai compatible response contained missing vectors".to_string()
767                        );
768                    }
769                }
770
771                self.dimension = vectors.first().map(Vec::len);
772                Ok(vectors)
773            }
774            SemanticEmbeddingEngine::Ollama {
775                client,
776                model,
777                base_url,
778            } => {
779                let expected_text_count = texts.len();
780                let endpoint = build_ollama_embeddings_endpoint(base_url);
781
782                #[derive(Serialize)]
783                struct OllamaPayload<'a> {
784                    model: &'a str,
785                    input: Vec<String>,
786                }
787
788                let payload = OllamaPayload {
789                    model,
790                    input: texts,
791                };
792
793                let raw = send_embedding_request(
794                    || {
795                        // `.json(&payload)` sets Content-Type automatically.
796                        // Same duplicate-header trap as the OpenAI branch above
797                        // — most Ollama servers tolerate it, but the
798                        // single-Content-Type form is the correct one.
799                        client.post(&endpoint).json(&payload)
800                    },
801                    "ollama",
802                )?;
803
804                #[derive(Deserialize)]
805                struct OllamaResponse {
806                    embeddings: Vec<Vec<f32>>,
807                }
808
809                let parsed: OllamaResponse = serde_json::from_str(&raw)
810                    .map_err(|error| format!("invalid ollama response: {error}"))?;
811                if parsed.embeddings.is_empty() {
812                    return Err("ollama response returned no embeddings".to_string());
813                }
814                if parsed.embeddings.len() != expected_text_count {
815                    return Err(format!(
816                        "ollama response returned {} embeddings for {} inputs",
817                        parsed.embeddings.len(),
818                        expected_text_count
819                    ));
820                }
821
822                let vectors = parsed.embeddings;
823                for vector in &vectors {
824                    if vector.is_empty() {
825                        return Err("ollama response contained empty embeddings".to_string());
826                    }
827                }
828
829                self.dimension = vectors.first().map(Vec::len);
830                Ok(vectors)
831            }
832        }
833    }
834}
835
836/// Pre-validate ONNX Runtime by attempting a raw dlopen before ort touches it.
837/// This catches broken/incompatible .so files without risking a panic in the ort crate.
838/// Also checks the runtime version via OrtGetApiBase if available.
839pub fn pre_validate_onnx_runtime() -> Result<(), String> {
840    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();
841
842    #[cfg(any(target_os = "linux", target_os = "macos"))]
843    {
844        #[cfg(target_os = "linux")]
845        let default_name = "libonnxruntime.so";
846        #[cfg(target_os = "macos")]
847        let default_name = "libonnxruntime.dylib";
848
849        let lib_name = dylib_path.as_deref().unwrap_or(default_name);
850
851        unsafe {
852            let c_name = std::ffi::CString::new(lib_name)
853                .map_err(|e| format!("invalid library path: {}", e))?;
854            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
855            if handle.is_null() {
856                let err = libc::dlerror();
857                let msg = if err.is_null() {
858                    "unknown dlopen error".to_string()
859                } else {
860                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
861                };
862                return Err(format!(
863                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
864                     Run `npx @cortexkit/aft doctor` to diagnose.",
865                    lib_name, msg
866                ));
867            }
868
869            // Try to detect the runtime version from the actual loaded library
870            // path first. A bare dlopen("libonnxruntime.so") may resolve to an
871            // older system ORT through loader search paths; checking only the
872            // caller-supplied soname would miss that and let ort fail opaquely.
873            let (detected_version, version_source) =
874                detect_ort_version_from_loaded_library(handle, lib_name);
875
876            libc::dlclose(handle);
877
878            // Check version compatibility — we need 1.20+.
879            if let Some(ref version) = detected_version {
880                let parts: Vec<&str> = version.split('.').collect();
881                if let (Some(major), Some(minor)) = (
882                    parts.first().and_then(|s| s.parse::<u32>().ok()),
883                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
884                ) {
885                    if major != 1 || minor < 20 {
886                        return Err(format_ort_version_mismatch(version, &version_source));
887                    }
888                }
889            }
890        }
891    }
892
893    #[cfg(target_os = "windows")]
894    {
895        // Validate ONNX Runtime availability on Windows by loading the DLL
896        // via LoadLibraryExW before the ort crate attempts its own LoadLibrary.
897        // This way we can produce a friendly error (with installation hints)
898        // instead of a raw LoadLibrary failure from deep inside fastembed.
899        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");
900
901        // Use kernel32 LoadLibraryExW for the validation — built-in, no
902        // crate dependency required. GetModuleFileNameW resolves the loaded
903        // DLL path for version probing via the version.dll API.
904        #[link(name = "kernel32")]
905        extern "system" {
906            fn LoadLibraryExW(
907                lpLibFileName: *const u16,
908                hFile: *mut std::ffi::c_void,
909                dwFlags: u32,
910            ) -> *mut std::ffi::c_void;
911            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
912            fn GetModuleFileNameW(
913                hModule: *mut std::ffi::c_void,
914                lpFilename: *mut u16,
915                nSize: u32,
916            ) -> u32;
917        }
918
919        #[link(name = "version")]
920        extern "system" {
921            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
922            fn GetFileVersionInfoW(
923                lptstrFilename: *const u16,
924                dwHandle: u32,
925                dwLen: u32,
926                lpData: *mut std::ffi::c_void,
927            ) -> i32;
928            fn VerQueryValueW(
929                pBlock: *mut std::ffi::c_void,
930                lpSubBlock: *const u16,
931                lplpBuffer: *mut *mut std::ffi::c_void,
932                puLen: *mut u32,
933            ) -> i32;
934        }
935
936        #[repr(C)]
937        struct VS_FIXEDFILEINFO {
938            dw_signature: u32,
939            dw_struc_version: u32,
940            dw_file_version_ms: u32, // HIWORD major, LOWORD minor
941            dw_file_version_ls: u32, // HIWORD build, LOWORD revision
942            dw_product_version_ms: u32,
943            dw_product_version_ls: u32,
944            dw_file_flags_mask: u32,
945            dw_file_flags: u32,
946            dw_file_os: u32,
947            dw_file_type: u32,
948            dw_file_subtype: u32,
949            dw_file_date_ms: u32,
950            dw_file_date_ls: u32,
951        }
952
953        unsafe {
954            use std::os::windows::ffi::OsStrExt;
955            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
956                .encode_wide()
957                .chain(std::iter::once(0))
958                .collect();
959
960            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
961            if handle.is_null() {
962                let err = std::io::Error::last_os_error();
963                return Err(format!(
964                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
965                     Run `npx @cortexkit/aft doctor` to diagnose.",
966                    lib_name, err
967                ));
968            }
969
970            // Probe the file version from PE resources so we can reject
971            // outdated DLLs (e.g. v1.9.x) before the ort crate panics.
972            let mut detected_major: u32 = 0;
973            let mut detected_minor: u32 = 0;
974            // Use MAX_UNICODEPATH (32767) so deeply nested ORT paths (e.g.
975            // long NuGet package paths under %USERPROFILE%) never truncate.
976            // GetModuleFileNameW truncates silently when the buffer is too
977            // small, which causes version probing to fail and the version
978            // check to be bypassed — better to allocate generously.
979            let mut path_buf = [0u16; 32767];
980            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
981            if path_len > 0 {
982                let mut dummy_handle: u32 = 0;
983                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
984                if info_size > 0 {
985                    let mut info = vec![0u8; info_size as usize];
986                    if GetFileVersionInfoW(
987                        path_buf.as_ptr(),
988                        0,
989                        info_size,
990                        info.as_mut_ptr() as *mut std::ffi::c_void,
991                    ) != 0
992                    {
993                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
994                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
995                        let mut vs_len: u32 = 0;
996                        if VerQueryValueW(
997                            info.as_mut_ptr() as *mut std::ffi::c_void,
998                            sub_block.as_ptr(),
999                            &mut vs_info,
1000                            &mut vs_len,
1001                        ) != 0
1002                            && !vs_info.is_null()
1003                        {
1004                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
1005                            detected_major = (*fixed).dw_file_version_ms >> 16;
1006                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
1007                        }
1008                    }
1009                }
1010            }
1011
1012            FreeLibrary(handle);
1013
1014            // Version compatibility check (mirrors the Linux/macOS path).
1015            // If version could not be detected (detected_major == 0) we let
1016            // the load succeed — the ort crate will diagnose further.
1017            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
1018                let ver = format!("{}.{}", detected_major, detected_minor);
1019                return Err(format_ort_version_mismatch(&ver, lib_name));
1020            }
1021        }
1022    }
1023
1024    Ok(())
1025}
1026
1027#[cfg(any(target_os = "linux", target_os = "macos"))]
1028unsafe fn loaded_library_path_from_handle(handle: *mut std::ffi::c_void) -> Option<String> {
1029    let symbol_name = std::ffi::CString::new("OrtGetApiBase").ok()?;
1030    let symbol = unsafe { libc::dlsym(handle, symbol_name.as_ptr()) };
1031    if symbol.is_null() {
1032        return None;
1033    }
1034
1035    let mut info = std::mem::MaybeUninit::<libc::Dl_info>::uninit();
1036    if unsafe { libc::dladdr(symbol, info.as_mut_ptr()) } == 0 {
1037        return None;
1038    }
1039
1040    let info = unsafe { info.assume_init() };
1041    if info.dli_fname.is_null() {
1042        return None;
1043    }
1044
1045    Some(
1046        unsafe { std::ffi::CStr::from_ptr(info.dli_fname) }
1047            .to_string_lossy()
1048            .into_owned(),
1049    )
1050}
1051
1052#[cfg(any(target_os = "linux", target_os = "macos"))]
1053fn detect_ort_version_from_resolved_or_requested(
1054    resolved_path: Option<String>,
1055    requested_lib_name: &str,
1056) -> (Option<String>, String) {
1057    if let Some(path) = resolved_path {
1058        if let Some(version) = detect_ort_version_from_path(&path) {
1059            return (Some(version), path);
1060        }
1061        return (detect_ort_version_from_path(requested_lib_name), path);
1062    }
1063
1064    (
1065        detect_ort_version_from_path(requested_lib_name),
1066        requested_lib_name.to_string(),
1067    )
1068}
1069
1070#[cfg(any(target_os = "linux", target_os = "macos"))]
1071fn detect_ort_version_from_loaded_library(
1072    handle: *mut std::ffi::c_void,
1073    requested_lib_name: &str,
1074) -> (Option<String>, String) {
1075    detect_ort_version_from_resolved_or_requested(
1076        unsafe { loaded_library_path_from_handle(handle) },
1077        requested_lib_name,
1078    )
1079}
1080
1081/// Try to extract the ORT version from the library filename or resolved symlink.
1082/// Examples: "libonnxruntime.so.1.19.0" → "1.19.0", "libonnxruntime.1.24.4.dylib" → "1.24.4"
1083#[cfg(any(target_os = "linux", target_os = "macos"))]
1084fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
1085    let path = std::path::Path::new(lib_path);
1086
1087    // Try the path as given, then follow symlinks
1088    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
1089        .into_iter()
1090        .flatten()
1091    {
1092        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1093            if let Some(version) = extract_version_from_filename(name) {
1094                return Some(version);
1095            }
1096        }
1097    }
1098
1099    // Also check for versioned siblings in the same directory
1100    if let Some(parent) = path.parent() {
1101        if let Ok(entries) = std::fs::read_dir(parent) {
1102            for entry in entries.flatten() {
1103                if let Some(name) = entry.file_name().to_str() {
1104                    if name.starts_with("libonnxruntime") {
1105                        if let Some(version) = extract_version_from_filename(name) {
1106                            return Some(version);
1107                        }
1108                    }
1109                }
1110            }
1111        }
1112    }
1113
1114    None
1115}
1116
1117/// Extract version from filenames like "libonnxruntime.so.1.19.0" or "libonnxruntime.1.24.4.dylib"
1118#[cfg(any(target_os = "linux", target_os = "macos"))]
1119fn extract_version_from_filename(name: &str) -> Option<String> {
1120    // Match patterns: .so.X.Y.Z or .X.Y.Z.dylib or .X.Y.Z.so
1121    let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
1122    re.find(name).map(|m| m.as_str().to_string())
1123}
1124
/// Build the shell command suggested to the user for removing an outdated
/// ONNX Runtime library (indented by three spaces for the numbered list it is
/// embedded in).
///
/// When the library sits directly in `/usr/local/lib`, or was requested by
/// its bare default name, Linux and macOS get a `sudo rm` glob that removes
/// every `libonnxruntime*` file there. Every other case gets a plain `rm` of
/// the exact path.
fn suggest_removal_command(lib_path: &str) -> String {
    // Compare the parent directory rather than a string prefix: a prefix test
    // also matches `/usr/local/lib64/...` or nested directories, where the
    // `/usr/local/lib/libonnxruntime*` glob would not remove the file.
    let in_usr_local_lib = Path::new(lib_path).parent() == Some(Path::new("/usr/local/lib"));
    let is_bare_default_name =
        lib_path == "libonnxruntime.so" || lib_path == "libonnxruntime.dylib";

    if in_usr_local_lib || is_bare_default_name {
        #[cfg(target_os = "linux")]
        return "   sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        #[cfg(target_os = "macos")]
        return "   sudo rm /usr/local/lib/libonnxruntime*".to_string();
    }
    format!("   rm '{}'", lib_path)
}
1137
1138/// Build the user-facing error message for an incompatible ONNX Runtime
1139/// install. Extracted as a pure helper so we can unit-test the wording
1140/// stability — the auto-fix recommendation must always come first because
1141/// it's the only safe option, and the system-rm step must remain present
1142/// because some users prefer the system-wide cleanup path.
1143pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1144    format!(
1145        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1146         Solutions:\n\
1147         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1148         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1149         configures the bridge to load it instead of the system library — no \
1150         changes to '{}'.\n\
1151         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1152         {}\n\
1153         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1154         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1155        version,
1156        lib_name,
1157        lib_name,
1158        suggest_removal_command(lib_name),
1159    )
1160}
1161
1162pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
1163    if message.trim_start().starts_with("ONNX Runtime not found.") {
1164        return true;
1165    }
1166
1167    let message = message.to_ascii_lowercase();
1168    let mentions_onnx_runtime = ["onnx runtime", "onnxruntime", "libonnxruntime"]
1169        .iter()
1170        .any(|pattern| message.contains(pattern));
1171    let mentions_dynamic_load_failure = [
1172        "shared library",
1173        "dynamic library",
1174        "failed to load",
1175        "could not load",
1176        "unable to load",
1177        "dlopen",
1178        "loadlibrary",
1179        "no such file",
1180        "not found",
1181    ]
1182    .iter()
1183    .any(|pattern| message.contains(pattern));
1184
1185    mentions_onnx_runtime && mentions_dynamic_load_failure
1186}
1187
1188pub fn format_embedding_init_error(error: impl Display) -> String {
1189    let message = error.to_string();
1190
1191    if is_onnx_runtime_unavailable(&message) {
1192        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1193    }
1194
1195    format!("failed to initialize semantic embedding model: {message}")
1196}
1197
1198/// A chunk of code ready for embedding — derived from a Symbol with context enrichment
1199#[derive(Debug, Clone)]
1200pub struct SemanticChunk {
1201    /// Absolute file path
1202    pub file: PathBuf,
1203    /// Symbol name
1204    pub name: String,
1205    /// Symbol kind (function, class, struct, etc.)
1206    pub kind: SymbolKind,
1207    /// Line range (0-based internally, inclusive)
1208    pub start_line: u32,
1209    pub end_line: u32,
1210    /// Whether the symbol is exported
1211    pub exported: bool,
1212    /// The enriched text that gets embedded (scope + signature + body snippet)
1213    pub embed_text: String,
1214    /// Short code snippet for display in results
1215    pub snippet: String,
1216}
1217
1218/// A stored embedding entry — chunk metadata + vector
1219#[derive(Debug, Clone)]
1220pub struct EmbeddingEntry {
1221    chunk: SemanticChunk,
1222    vector: Vec<f32>,
1223}
1224
1225/// The semantic index — stores embeddings for all symbols in a project
1226#[derive(Debug, Clone)]
1227pub struct SemanticIndex {
1228    entries: Vec<EmbeddingEntry>,
1229    /// Track which files are indexed and their mtime for staleness detection
1230    file_mtimes: HashMap<PathBuf, SystemTime>,
1231    /// Track indexed file sizes alongside mtimes for staleness detection
1232    file_sizes: HashMap<PathBuf, u64>,
1233    file_hashes: HashMap<PathBuf, blake3::Hash>,
1234    /// Embedding dimension (384 for MiniLM-L6-v2)
1235    dimension: usize,
1236    fingerprint: Option<SemanticIndexFingerprint>,
1237    project_root: PathBuf,
1238    deferred_files: HashSet<PathBuf>,
1239}
1240
1241#[derive(Debug, Clone, Copy)]
1242struct IndexedFileMetadata {
1243    mtime: SystemTime,
1244    size: u64,
1245    content_hash: blake3::Hash,
1246}
1247
/// Outcome of an incremental refresh of the semantic index.
///
/// All counters are numbers of files. `total_processed` counts the current
/// and deleted files that were considered.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Files whose content changed.
    pub changed: usize,
    /// Files newly added to the index.
    pub added: usize,
    /// Files removed from the index.
    pub deleted: usize,
    /// Current and deleted files considered by the refresh.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// Whether the refresh left every file untouched (nothing changed, added
    /// or deleted). `total_processed` does not influence the result.
    pub fn is_noop(&self) -> bool {
        matches!((self.changed, self.added, self.deleted), (0, 0, 0))
    }
}
1264
1265#[derive(Debug, Default)]
1266pub struct InvalidatedFilesRefresh {
1267    /// Full replacement entries for `completed_paths`, not just newly embedded
1268    /// chunks. `apply_refresh_update` removes completed paths before extending
1269    /// this set, so reused chunks must travel in this delta too.
1270    pub added_entries: Vec<EmbeddingEntry>,
1271    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
1272    pub completed_paths: Vec<PathBuf>,
1273    pub summary: RefreshSummary,
1274}
1275
/// A previously computed embedding that can be reused when a chunk's
/// `embed_text` is unchanged.
#[derive(Debug, Clone)]
struct ReusableEmbedding {
    /// Exact text the vector was computed from; compared on lookup to rule
    /// out hash collisions.
    embed_text: String,
    /// The embedding vector for `embed_text`.
    vector: Vec<f32>,
}
1281
1282type ChunkReuseMap = HashMap<PathBuf, HashMap<blake3::Hash, Vec<ReusableEmbedding>>>;
1283
1284/// Search result from a semantic query
1285#[derive(Debug, Clone)]
1286pub struct SemanticResult {
1287    pub file: PathBuf,
1288    pub name: String,
1289    pub kind: SymbolKind,
1290    pub start_line: u32,
1291    pub end_line: u32,
1292    pub exported: bool,
1293    pub snippet: String,
1294    pub score: f32,
1295    pub source: &'static str,
1296}
1297
1298impl SemanticIndex {
1299    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1300        debug_assert!(project_root.is_absolute());
1301        Self {
1302            entries: Vec::new(),
1303            file_mtimes: HashMap::new(),
1304            file_sizes: HashMap::new(),
1305            file_hashes: HashMap::new(),
1306            dimension,
1307            fingerprint: None,
1308            project_root,
1309            deferred_files: HashSet::new(),
1310        }
1311    }
1312
1313    /// Number of embedded symbol entries.
1314    pub fn entry_count(&self) -> usize {
1315        self.entries.len()
1316    }
1317
1318    /// Number of files currently tracked by the semantic index.
1319    pub fn indexed_file_count(&self) -> usize {
1320        self.file_mtimes.len()
1321    }
1322
1323    /// Human-readable status label for the index.
1324    pub fn status_label(&self) -> &'static str {
1325        if self.entries.is_empty() {
1326            "empty"
1327        } else {
1328            "ready"
1329        }
1330    }
1331
1332    fn collect_chunks(
1333        project_root: &Path,
1334        files: &[PathBuf],
1335    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1336        let collect_started = std::time::Instant::now();
1337        let per_file: Vec<(
1338            PathBuf,
1339            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1340        )> = files
1341            .par_iter()
1342            .map_init(HashMap::new, |parsers, file| {
1343                let result = collect_file_metadata(file).and_then(|metadata| {
1344                    collect_file_chunks(project_root, file, parsers)
1345                        .map(|chunks| (metadata, chunks))
1346                });
1347                (file.clone(), result)
1348            })
1349            .collect();
1350
1351        let mut chunks: Vec<SemanticChunk> = Vec::new();
1352        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1353
1354        for (file, result) in per_file {
1355            match result {
1356                Ok((metadata, file_chunks)) => {
1357                    file_metadata.insert(file, metadata);
1358                    chunks.extend(file_chunks);
1359                }
1360                Err(error) => {
1361                    // "unsupported file extension" is expected for non-code files
1362                    // (json, xml, .gitignore, etc.) that get included in the
1363                    // project walk. Pre-fix this was swallowed by .unwrap_or_default();
1364                    // we now skip silently to keep the log clean. Only real read/parse
1365                    // errors are worth surfacing.
1366                    if error == "unsupported file extension" {
1367                        continue;
1368                    }
1369                    slog_warn!(
1370                        "failed to collect semantic chunks for {}: {}",
1371                        file.display(),
1372                        error
1373                    );
1374                }
1375            }
1376        }
1377
1378        slog_info!(
1379            "semantic collect: {} chunks from {} files in {} ms",
1380            chunks.len(),
1381            file_metadata.len(),
1382            collect_started.elapsed().as_millis()
1383        );
1384
1385        (chunks, file_metadata)
1386    }
1387
1388    fn build_chunk_reuse_map(&self, files: &[PathBuf]) -> ChunkReuseMap {
1389        let requested: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1390        let mut reuse_map: ChunkReuseMap = HashMap::new();
1391
1392        for entry in &self.entries {
1393            if !requested.contains(entry.chunk.file.as_path()) {
1394                continue;
1395            }
1396
1397            // `embed_text` is already persisted in the current on-disk format,
1398            // so refresh-time reuse can hash it in memory and confirm the exact
1399            // string without bumping `SEMANTIC_INDEX_VERSION` and forcing every
1400            // user through a full rebuild.
1401            let hash = blake3::hash(entry.chunk.embed_text.as_bytes());
1402            reuse_map
1403                .entry(entry.chunk.file.clone())
1404                .or_default()
1405                .entry(hash)
1406                .or_default()
1407                .push(ReusableEmbedding {
1408                    embed_text: entry.chunk.embed_text.clone(),
1409                    vector: entry.vector.clone(),
1410                });
1411        }
1412
1413        reuse_map
1414    }
1415
1416    fn reusable_vector_for_chunk(
1417        reuse_map: &ChunkReuseMap,
1418        chunk: &SemanticChunk,
1419    ) -> Option<Vec<f32>> {
1420        let hash = blake3::hash(chunk.embed_text.as_bytes());
1421        reuse_map
1422            .get(&chunk.file)?
1423            .get(&hash)?
1424            .iter()
1425            .find(|candidate| candidate.embed_text == chunk.embed_text)
1426            .map(|candidate| candidate.vector.clone())
1427    }
1428
1429    fn entries_for_chunks_with_reuse<F, P>(
1430        chunks: Vec<SemanticChunk>,
1431        reuse_map: &ChunkReuseMap,
1432        embed_fn: &mut F,
1433        max_batch_size: usize,
1434        initial_observed_dimension: Option<usize>,
1435        refresh_label: &str,
1436        progress: &mut P,
1437    ) -> Result<(Vec<EmbeddingEntry>, Option<usize>), String>
1438    where
1439        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1440        P: FnMut(usize, usize),
1441    {
1442        let total_chunks = chunks.len();
1443        progress(0, total_chunks);
1444
1445        let mut entries_by_chunk: Vec<Option<EmbeddingEntry>> = vec![None; total_chunks];
1446        let mut misses: Vec<(usize, SemanticChunk)> = Vec::new();
1447
1448        for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1449            if let Some(vector) = Self::reusable_vector_for_chunk(reuse_map, &chunk) {
1450                entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1451            } else {
1452                misses.push((chunk_index, chunk));
1453            }
1454        }
1455
1456        let mut completed = total_chunks.saturating_sub(misses.len());
1457        if completed > 0 {
1458            progress(completed, total_chunks);
1459        }
1460
1461        let batch_size = max_batch_size.max(1);
1462        let mut observed_dimension = initial_observed_dimension;
1463
1464        for batch_start in (0..misses.len()).step_by(batch_size) {
1465            let batch_end = (batch_start + batch_size).min(misses.len());
1466            let batch_texts: Vec<String> = misses[batch_start..batch_end]
1467                .iter()
1468                .map(|(_, chunk)| chunk.embed_text.clone())
1469                .collect();
1470
1471            let vectors = embed_fn(batch_texts)?;
1472            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1473
1474            if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1475                match observed_dimension {
1476                    None => observed_dimension = Some(dim),
1477                    Some(expected) if dim != expected => {
1478                        return Err(format!(
1479                            "embedding dimension changed during {refresh_label}: \
1480                             cached index uses {expected}, new vectors use {dim}"
1481                        ));
1482                    }
1483                    _ => {}
1484                }
1485            }
1486
1487            for (i, vector) in vectors.into_iter().enumerate() {
1488                let (chunk_index, chunk) = misses[batch_start + i].clone();
1489                entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1490            }
1491
1492            completed += batch_end - batch_start;
1493            progress(completed, total_chunks);
1494        }
1495
1496        let entries = entries_by_chunk
1497            .into_iter()
1498            .map(|entry| entry.expect("semantic refresh accounted for every chunk"))
1499            .collect();
1500
1501        Ok((entries, observed_dimension))
1502    }
1503
1504    fn build_from_chunks<F, P>(
1505        project_root: &Path,
1506        chunks: Vec<SemanticChunk>,
1507        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1508        embed_fn: &mut F,
1509        max_batch_size: usize,
1510        mut progress: Option<&mut P>,
1511    ) -> Result<Self, String>
1512    where
1513        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1514        P: FnMut(usize, usize),
1515    {
1516        debug_assert!(project_root.is_absolute());
1517        let total_chunks = chunks.len();
1518
1519        if chunks.is_empty() {
1520            return Ok(Self {
1521                entries: Vec::new(),
1522                file_mtimes: file_metadata
1523                    .iter()
1524                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
1525                    .collect(),
1526                file_sizes: file_metadata
1527                    .iter()
1528                    .map(|(path, metadata)| (path.clone(), metadata.size))
1529                    .collect(),
1530                file_hashes: file_metadata
1531                    .into_iter()
1532                    .map(|(path, metadata)| (path, metadata.content_hash))
1533                    .collect(),
1534                dimension: DEFAULT_DIMENSION,
1535                fingerprint: None,
1536                project_root: project_root.to_path_buf(),
1537                deferred_files: HashSet::new(),
1538            });
1539        }
1540
1541        // Embed in batches
1542        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1543        let mut expected_dimension: Option<usize> = None;
1544        let batch_size = max_batch_size.max(1);
1545        let embed_started = std::time::Instant::now();
1546        let batch_count = total_chunks.div_ceil(batch_size);
1547        for batch_start in (0..chunks.len()).step_by(batch_size) {
1548            let batch_end = (batch_start + batch_size).min(chunks.len());
1549            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1550                .iter()
1551                .map(|c| c.embed_text.clone())
1552                .collect();
1553
1554            let vectors = embed_fn(batch_texts)?;
1555            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1556
1557            // Track consistent dimension across all batches
1558            if let Some(dim) = vectors.first().map(|v| v.len()) {
1559                match expected_dimension {
1560                    None => expected_dimension = Some(dim),
1561                    Some(expected) if dim != expected => {
1562                        return Err(format!(
1563                            "embedding dimension changed across batches: expected {expected}, got {dim}"
1564                        ));
1565                    }
1566                    _ => {}
1567                }
1568            }
1569
1570            for (i, vector) in vectors.into_iter().enumerate() {
1571                let chunk_idx = batch_start + i;
1572                entries.push(EmbeddingEntry {
1573                    chunk: chunks[chunk_idx].clone(),
1574                    vector,
1575                });
1576            }
1577
1578            if let Some(callback) = progress.as_mut() {
1579                callback(entries.len(), total_chunks);
1580            }
1581        }
1582
1583        let embed_ms = embed_started.elapsed().as_millis();
1584        let rate = (total_chunks as u128 * 1000)
1585            .checked_div(embed_ms)
1586            .unwrap_or(0) as u64;
1587        slog_info!(
1588            "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1589            total_chunks,
1590            batch_count,
1591            embed_ms,
1592            rate
1593        );
1594
1595        let dimension = entries
1596            .first()
1597            .map(|e| e.vector.len())
1598            .unwrap_or(DEFAULT_DIMENSION);
1599
1600        Ok(Self {
1601            entries,
1602            file_mtimes: file_metadata
1603                .iter()
1604                .map(|(path, metadata)| (path.clone(), metadata.mtime))
1605                .collect(),
1606            file_sizes: file_metadata
1607                .iter()
1608                .map(|(path, metadata)| (path.clone(), metadata.size))
1609                .collect(),
1610            file_hashes: file_metadata
1611                .into_iter()
1612                .map(|(path, metadata)| (path, metadata.content_hash))
1613                .collect(),
1614            dimension,
1615            fingerprint: None,
1616            project_root: project_root.to_path_buf(),
1617            deferred_files: HashSet::new(),
1618        })
1619    }
1620
1621    /// Build the semantic index from a set of files using the provided embedding function.
1622    /// `embed_fn` takes a batch of texts and returns a batch of embedding vectors.
1623    pub fn build<F>(
1624        project_root: &Path,
1625        files: &[PathBuf],
1626        embed_fn: &mut F,
1627        max_batch_size: usize,
1628    ) -> Result<Self, String>
1629    where
1630        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1631    {
1632        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1633        Self::build_from_chunks(
1634            project_root,
1635            chunks,
1636            file_mtimes,
1637            embed_fn,
1638            max_batch_size,
1639            Option::<&mut fn(usize, usize)>::None,
1640        )
1641    }
1642
1643    /// Build the semantic index and report embedding progress using entry counts.
1644    pub fn build_with_progress<F, P>(
1645        project_root: &Path,
1646        files: &[PathBuf],
1647        embed_fn: &mut F,
1648        max_batch_size: usize,
1649        progress: &mut P,
1650    ) -> Result<Self, String>
1651    where
1652        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1653        P: FnMut(usize, usize),
1654    {
1655        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1656        let total_chunks = chunks.len();
1657        progress(0, total_chunks);
1658        Self::build_from_chunks(
1659            project_root,
1660            chunks,
1661            file_mtimes,
1662            embed_fn,
1663            max_batch_size,
1664            Some(progress),
1665        )
1666    }
1667
1668    /// Incrementally refresh entries for changed/new files only, preserving cached
1669    /// embeddings for unchanged files. Used when loading the index from disk and
1670    /// finding that a small fraction of files have moved on, deleted, or appeared.
1671    ///
1672    /// Returns `RefreshSummary` describing what changed. On success, `self` is
1673    /// mutated in place and remains a valid index.
1674    ///
1675    /// `current_files` is the full set of files the project considers indexable
1676    /// (typically `walk_project_files(...)`). Files in the cache that are no
1677    /// longer in this set are treated as deleted.
1678    pub fn refresh_stale_files<F, P>(
1679        &mut self,
1680        project_root: &Path,
1681        current_files: &[PathBuf],
1682        embed_fn: &mut F,
1683        max_batch_size: usize,
1684        progress: &mut P,
1685    ) -> Result<RefreshSummary, String>
1686    where
1687        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1688        P: FnMut(usize, usize),
1689    {
1690        self.backfill_missing_file_sizes();
1691
1692        // 1. Bucket files into deleted / changed / added.
1693        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
1694        self.deferred_files
1695            .retain(|path| current_set.contains(path.as_path()));
1696        let total_processed = current_set.len() + self.file_mtimes.len()
1697            - self
1698                .file_mtimes
1699                .keys()
1700                .filter(|path| current_set.contains(path.as_path()))
1701                .count();
1702
1703        // Files in cache that disappeared from disk OR are no longer in the
1704        // walked set. Both cases need their entries dropped.
1705        let mut deleted: Vec<PathBuf> = Vec::new();
1706        let mut changed: Vec<PathBuf> = Vec::new();
1707        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
1708        for indexed_path in &indexed_paths {
1709            if !current_set.contains(indexed_path.as_path()) {
1710                deleted.push(indexed_path.clone());
1711                continue;
1712            }
1713            let cached = match (
1714                self.file_mtimes.get(indexed_path),
1715                self.file_sizes.get(indexed_path),
1716                self.file_hashes.get(indexed_path),
1717            ) {
1718                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
1719                    mtime: *mtime,
1720                    size: *size,
1721                    content_hash: *hash,
1722                }),
1723                _ => None,
1724            };
1725            match cached
1726                .map(|freshness| cache_freshness::verify_file_strict(indexed_path, &freshness))
1727            {
1728                Some(FreshnessVerdict::HotFresh) => {}
1729                Some(FreshnessVerdict::ContentFresh {
1730                    new_mtime,
1731                    new_size,
1732                }) => {
1733                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
1734                    self.file_sizes.insert(indexed_path.clone(), new_size);
1735                }
1736                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
1737                    changed.push(indexed_path.clone());
1738                }
1739            }
1740        }
1741
1742        // Files in walk that were never indexed.
1743        let mut added: Vec<PathBuf> = Vec::new();
1744        for path in current_files {
1745            if !self.file_mtimes.contains_key(path) {
1746                added.push(path.clone());
1747            }
1748        }
1749
1750        // Fast path: nothing to do.
1751        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
1752            progress(0, 0);
1753            return Ok(RefreshSummary {
1754                total_processed,
1755                ..RefreshSummary::default()
1756            });
1757        }
1758
1759        // 2. Drop entries for deleted files immediately. Changed files are only
1760        //    replaced after successful re-extraction + embedding so transient
1761        //    read/parse errors keep the stale-but-valid cache entry.
1762        if !deleted.is_empty() {
1763            self.remove_indexed_files(&deleted);
1764        }
1765
1766        // 3. Embed the changed + added set, if any.
1767        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
1768        to_embed.extend(changed.iter().cloned());
1769        to_embed.extend(added.iter().cloned());
1770
1771        if to_embed.is_empty() {
1772            // Only deletions happened.
1773            progress(0, 0);
1774            return Ok(RefreshSummary {
1775                changed: 0,
1776                added: 0,
1777                deleted: deleted.len(),
1778                total_processed,
1779            });
1780        }
1781
1782        let reuse_map = self.build_chunk_reuse_map(&changed);
1783        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
1784        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
1785        let vanished = to_embed
1786            .iter()
1787            .filter(|path| {
1788                changed_set.contains(path.as_path())
1789                    && !fresh_metadata.contains_key(*path)
1790                    && !path.exists()
1791            })
1792            .cloned()
1793            .collect::<Vec<_>>();
1794        if !vanished.is_empty() {
1795            self.remove_indexed_files(&vanished);
1796            deleted.extend(vanished);
1797        }
1798
1799        if chunks.is_empty() {
1800            progress(0, 0);
1801            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1802            for file in &successful_files {
1803                self.deferred_files.remove(file);
1804            }
1805            if !successful_files.is_empty() {
1806                self.entries
1807                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
1808            }
1809            let changed_count = changed
1810                .iter()
1811                .filter(|path| successful_files.contains(*path))
1812                .count();
1813            let added_count = added
1814                .iter()
1815                .filter(|path| successful_files.contains(*path))
1816                .count();
1817            for (file, metadata) in fresh_metadata {
1818                self.file_mtimes.insert(file.clone(), metadata.mtime);
1819                self.file_sizes.insert(file.clone(), metadata.size);
1820                self.file_hashes.insert(file.clone(), metadata.content_hash);
1821            }
1822            return Ok(RefreshSummary {
1823                changed: changed_count,
1824                added: added_count,
1825                deleted: deleted.len(),
1826                total_processed,
1827            });
1828        }
1829
1830        // 4. Build the full replacement set, reusing cached vectors for chunks
1831        //    whose embed_text is unchanged and embedding only cache misses.
1832        let existing_dimension = if self.entries.is_empty() {
1833            None
1834        } else {
1835            Some(self.dimension)
1836        };
1837        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
1838            chunks,
1839            &reuse_map,
1840            embed_fn,
1841            max_batch_size,
1842            existing_dimension,
1843            "incremental refresh",
1844            progress,
1845        )?;
1846
1847        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1848        for file in &successful_files {
1849            self.deferred_files.remove(file);
1850        }
1851        if !successful_files.is_empty() {
1852            self.entries
1853                .retain(|entry| !successful_files.contains(&entry.chunk.file));
1854        }
1855
1856        self.entries.extend(new_entries);
1857        for (file, metadata) in fresh_metadata {
1858            self.file_mtimes.insert(file.clone(), metadata.mtime);
1859            self.file_sizes.insert(file.clone(), metadata.size);
1860            self.file_hashes.insert(file, metadata.content_hash);
1861        }
1862        if let Some(dim) = observed_dimension {
1863            self.dimension = dim;
1864        }
1865
1866        Ok(RefreshSummary {
1867            changed: changed
1868                .iter()
1869                .filter(|path| successful_files.contains(*path))
1870                .count(),
1871            added: added
1872                .iter()
1873                .filter(|path| successful_files.contains(*path))
1874                .count(),
1875            deleted: deleted.len(),
1876            total_processed,
1877        })
1878    }
1879
1880    /// Refresh exactly the files invalidated by the live watcher, without
1881    /// treating the provided path list as the whole project. This is the
1882    /// watcher-side counterpart to `refresh_stale_files`: it drops any stale
1883    /// entries for the requested paths from this in-memory index, re-extracts
1884    /// whatever still exists on disk, embeds those chunks, and returns the
1885    /// delta needed for another in-memory index to apply the same update.
1886    pub fn refresh_invalidated_files<F, P>(
1887        &mut self,
1888        project_root: &Path,
1889        paths: &[PathBuf],
1890        embed_fn: &mut F,
1891        max_batch_size: usize,
1892        max_files: usize,
1893        progress: &mut P,
1894    ) -> Result<InvalidatedFilesRefresh, String>
1895    where
1896        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1897        P: FnMut(usize, usize),
1898    {
1899        self.backfill_missing_file_sizes();
1900
1901        self.deferred_files.retain(|path| path.exists());
1902        let mut requested_paths = paths.to_vec();
1903        requested_paths.extend(self.deferred_files.iter().cloned());
1904        requested_paths.sort();
1905        requested_paths.dedup();
1906        let total_processed = requested_paths.len();
1907
1908        if requested_paths.is_empty() {
1909            progress(0, 0);
1910            return Ok(InvalidatedFilesRefresh {
1911                summary: RefreshSummary {
1912                    total_processed,
1913                    ..RefreshSummary::default()
1914                },
1915                ..InvalidatedFilesRefresh::default()
1916            });
1917        }
1918
1919        let previously_indexed: HashSet<PathBuf> = requested_paths
1920            .iter()
1921            .filter(|path| self.file_mtimes.contains_key(*path))
1922            .cloned()
1923            .collect();
1924        let reuse_map = self.build_chunk_reuse_map(&requested_paths);
1925
1926        // The watcher path has already invalidated these files in the request
1927        // thread's live index. Mirror that behavior here before inserting any
1928        // fresh chunks so parse/read failures do not resurrect stale entries.
1929        self.remove_indexed_files(&requested_paths);
1930
1931        let existing_paths = requested_paths
1932            .iter()
1933            .filter(|path| path.exists())
1934            .cloned()
1935            .collect::<Vec<_>>();
1936        let deleted = requested_paths
1937            .iter()
1938            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
1939            .count();
1940
1941        if existing_paths.is_empty() {
1942            for path in &requested_paths {
1943                if !path.exists() {
1944                    self.deferred_files.remove(path);
1945                }
1946            }
1947            progress(0, 0);
1948            return Ok(InvalidatedFilesRefresh {
1949                completed_paths: requested_paths,
1950                summary: RefreshSummary {
1951                    deleted,
1952                    total_processed,
1953                    ..RefreshSummary::default()
1954                },
1955                ..InvalidatedFilesRefresh::default()
1956            });
1957        }
1958
1959        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);
1960
1961        let retained_file_count = self.file_mtimes.len();
1962        let changed_successful_count = existing_paths
1963            .iter()
1964            .filter(|path| {
1965                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1966            })
1967            .count();
1968        let available_new_files =
1969            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
1970        let new_successful_files = existing_paths
1971            .iter()
1972            .filter(|path| {
1973                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1974            })
1975            .cloned()
1976            .collect::<Vec<_>>();
1977        if new_successful_files.len() > available_new_files {
1978            let allowed_new_files = new_successful_files
1979                .iter()
1980                .take(available_new_files)
1981                .cloned()
1982                .collect::<HashSet<_>>();
1983            let deferred_new_files = new_successful_files
1984                .into_iter()
1985                .filter(|path| !allowed_new_files.contains(path))
1986                .collect::<HashSet<_>>();
1987
1988            fresh_metadata.retain(|file, _| {
1989                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
1990            });
1991            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));
1992
1993            if !deferred_new_files.is_empty() {
1994                for path in &deferred_new_files {
1995                    self.deferred_files.insert(path.clone());
1996                }
1997                slog_warn!(
1998                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
1999                    deferred_new_files.len(),
2000                    max_files
2001                );
2002            }
2003        }
2004
2005        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
2006        for file in &successful_files {
2007            self.deferred_files.remove(file);
2008        }
2009        let changed = successful_files
2010            .iter()
2011            .filter(|path| previously_indexed.contains(path.as_path()))
2012            .count();
2013        let added = successful_files.len().saturating_sub(changed);
2014        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());
2015
2016        if chunks.is_empty() {
2017            progress(0, 0);
2018            for (file, metadata) in fresh_metadata {
2019                let freshness = FileFreshness {
2020                    mtime: metadata.mtime,
2021                    size: metadata.size,
2022                    content_hash: metadata.content_hash,
2023                };
2024                self.file_mtimes.insert(file.clone(), freshness.mtime);
2025                self.file_sizes.insert(file.clone(), freshness.size);
2026                self.file_hashes
2027                    .insert(file.clone(), freshness.content_hash);
2028                updated_metadata.push((file, freshness));
2029            }
2030
2031            return Ok(InvalidatedFilesRefresh {
2032                updated_metadata,
2033                completed_paths: requested_paths,
2034                summary: RefreshSummary {
2035                    changed,
2036                    added,
2037                    deleted,
2038                    total_processed,
2039                },
2040                ..InvalidatedFilesRefresh::default()
2041            });
2042        }
2043
2044        let initial_observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty()
2045        {
2046            None
2047        } else {
2048            Some(self.dimension)
2049        };
2050        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
2051            chunks,
2052            &reuse_map,
2053            embed_fn,
2054            max_batch_size,
2055            initial_observed_dimension,
2056            "invalidated-file refresh",
2057            progress,
2058        )?;
2059
2060        let added_entries = new_entries.clone();
2061        self.entries.extend(new_entries);
2062        for (file, metadata) in fresh_metadata {
2063            let freshness = FileFreshness {
2064                mtime: metadata.mtime,
2065                size: metadata.size,
2066                content_hash: metadata.content_hash,
2067            };
2068            self.file_mtimes.insert(file.clone(), freshness.mtime);
2069            self.file_sizes.insert(file.clone(), freshness.size);
2070            self.file_hashes
2071                .insert(file.clone(), freshness.content_hash);
2072            updated_metadata.push((file, freshness));
2073        }
2074        if let Some(dim) = observed_dimension {
2075            self.dimension = dim;
2076        }
2077
2078        Ok(InvalidatedFilesRefresh {
2079            added_entries,
2080            updated_metadata,
2081            completed_paths: requested_paths,
2082            summary: RefreshSummary {
2083                changed,
2084                added,
2085                deleted,
2086                total_processed,
2087            },
2088        })
2089    }
2090
2091    pub fn apply_refresh_update(
2092        &mut self,
2093        added_entries: Vec<EmbeddingEntry>,
2094        updated_metadata: Vec<(PathBuf, FileFreshness)>,
2095        completed_paths: &[PathBuf],
2096    ) {
2097        // `added_entries` is the complete replacement set for completed paths:
2098        // freshly embedded misses plus reused chunks carrying refreshed metadata.
2099        // Removing first is safe only because producers include both kinds.
2100        self.remove_indexed_files(completed_paths);
2101
2102        let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
2103        self.entries.extend(added_entries);
2104        for (file, freshness) in updated_metadata {
2105            self.file_mtimes.insert(file.clone(), freshness.mtime);
2106            self.file_sizes.insert(file.clone(), freshness.size);
2107            self.file_hashes.insert(file, freshness.content_hash);
2108        }
2109        if let Some(dim) = observed_dimension {
2110            self.dimension = dim;
2111        }
2112    }
2113
2114    fn remove_indexed_files(&mut self, files: &[PathBuf]) {
2115        let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
2116        self.entries
2117            .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
2118        for path in files {
2119            self.file_mtimes.remove(path);
2120            self.file_sizes.remove(path);
2121            self.file_hashes.remove(path);
2122        }
2123    }
2124
2125    /// Search the index with a query embedding, returning top-K results sorted by relevance
2126    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
2127        if self.entries.is_empty() || query_vector.len() != self.dimension {
2128            return Vec::new();
2129        }
2130
2131        let mut scored: Vec<(f32, usize)> = self
2132            .entries
2133            .iter()
2134            .enumerate()
2135            .map(|(i, entry)| {
2136                let mut score = cosine_similarity(query_vector, &entry.vector);
2137                if entry.chunk.exported {
2138                    score *= 1.1;
2139                }
2140                (score, i)
2141            })
2142            .collect();
2143
2144        // Sort descending by score
2145        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
2146
2147        scored
2148            .into_iter()
2149            .take(top_k)
2150            // Keep the sort → take → map ordering explicit: removing the old
2151            // `> 0.0` floor cannot evict positive hits because top_k has already
2152            // been selected, but it can surface zero-score noise in the tail.
2153            .map(|(score, idx)| {
2154                let entry = &self.entries[idx];
2155                SemanticResult {
2156                    file: entry.chunk.file.clone(),
2157                    name: entry.chunk.name.clone(),
2158                    kind: entry.chunk.kind.clone(),
2159                    start_line: entry.chunk.start_line,
2160                    end_line: entry.chunk.end_line,
2161                    exported: entry.chunk.exported,
2162                    snippet: entry.chunk.snippet.clone(),
2163                    score,
2164                    source: "semantic",
2165                }
2166            })
2167            .collect()
2168    }
2169
2170    /// Number of indexed entries
2171    pub fn len(&self) -> usize {
2172        self.entries.len()
2173    }
2174
2175    /// Check if a file needs re-indexing based on mtime/size
2176    pub fn is_file_stale(&self, file: &Path) -> bool {
2177        let Some(stored_mtime) = self.file_mtimes.get(file) else {
2178            return true;
2179        };
2180        let Some(stored_size) = self.file_sizes.get(file) else {
2181            return true;
2182        };
2183        let Some(stored_hash) = self.file_hashes.get(file) else {
2184            return true;
2185        };
2186        let cached = FileFreshness {
2187            mtime: *stored_mtime,
2188            size: *stored_size,
2189            content_hash: *stored_hash,
2190        };
2191        match cache_freshness::verify_file_strict(file, &cached) {
2192            FreshnessVerdict::HotFresh => false,
2193            FreshnessVerdict::ContentFresh { .. } => false,
2194            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2195        }
2196    }
2197
2198    fn backfill_missing_file_sizes(&mut self) {
2199        for path in self.file_mtimes.keys() {
2200            if self.file_sizes.contains_key(path) {
2201                continue;
2202            }
2203            if let Ok(metadata) = fs::metadata(path) {
2204                self.file_sizes.insert(path.clone(), metadata.len());
2205                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2206                    self.file_hashes.insert(path.clone(), hash);
2207                }
2208            }
2209        }
2210    }
2211
2212    /// Remove entries for a specific file
2213    pub fn remove_file(&mut self, file: &Path) {
2214        self.invalidate_file(file);
2215    }
2216
2217    pub fn invalidate_file(&mut self, file: &Path) {
2218        let canonical_file = canonicalize_existing_or_deleted_path(file);
2219        self.entries
2220            .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2221        self.file_mtimes.remove(file);
2222        self.file_sizes.remove(file);
2223        self.file_hashes.remove(file);
2224        if canonical_file.as_path() != file {
2225            self.file_mtimes.remove(&canonical_file);
2226            self.file_sizes.remove(&canonical_file);
2227            self.file_hashes.remove(&canonical_file);
2228        }
2229    }
2230
2231    /// Get the embedding dimension
2232    pub fn dimension(&self) -> usize {
2233        self.dimension
2234    }
2235
2236    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
2237        self.fingerprint.as_ref()
2238    }
2239
2240    pub fn backend_label(&self) -> Option<&str> {
2241        self.fingerprint.as_ref().map(|f| f.backend.as_str())
2242    }
2243
2244    pub fn model_label(&self) -> Option<&str> {
2245        self.fingerprint.as_ref().map(|f| f.model.as_str())
2246    }
2247
2248    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
2249        self.fingerprint = Some(fingerprint);
2250    }
2251
2252    /// Write the semantic index to disk using atomic temp+rename pattern
2253    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2254        // Don't persist empty indexes — they would be loaded on next startup
2255        // and prevent a fresh build that might find files.
2256        if self.entries.is_empty() {
2257            slog_info!("skipping semantic index persistence (0 entries)");
2258            return;
2259        }
2260        let dir = storage_dir.join("semantic").join(project_key);
2261        if let Err(e) = fs::create_dir_all(&dir) {
2262            slog_warn!("failed to create semantic cache dir: {}", e);
2263            return;
2264        }
2265        let data_path = dir.join("semantic.bin");
2266        let tmp_path = dir.join(format!(
2267            "semantic.bin.tmp.{}.{}",
2268            std::process::id(),
2269            SystemTime::now()
2270                .duration_since(SystemTime::UNIX_EPOCH)
2271                .unwrap_or(Duration::ZERO)
2272                .as_nanos()
2273        ));
2274        let bytes = self.to_bytes();
2275        let write_result = (|| -> std::io::Result<()> {
2276            use std::io::Write;
2277            let mut file = fs::File::create(&tmp_path)?;
2278            file.write_all(&bytes)?;
2279            file.sync_all()?;
2280            Ok(())
2281        })();
2282        if let Err(e) = write_result {
2283            slog_warn!("failed to write semantic index: {}", e);
2284            let _ = fs::remove_file(&tmp_path);
2285            return;
2286        }
2287        if let Err(e) = fs::rename(&tmp_path, &data_path) {
2288            slog_warn!("failed to rename semantic index: {}", e);
2289            let _ = fs::remove_file(&tmp_path);
2290            return;
2291        }
2292        slog_info!(
2293            "semantic index persisted: {} entries, {:.1} KB",
2294            self.entries.len(),
2295            bytes.len() as f64 / 1024.0
2296        );
2297    }
2298
2299    /// Read the semantic index from disk
2300    pub fn read_from_disk(
2301        storage_dir: &Path,
2302        project_key: &str,
2303        current_canonical_root: &Path,
2304        is_worktree_bridge: bool,
2305        expected_fingerprint: Option<&str>,
2306    ) -> Option<Self> {
2307        debug_assert!(current_canonical_root.is_absolute());
2308        let data_path = storage_dir
2309            .join("semantic")
2310            .join(project_key)
2311            .join("semantic.bin");
2312        let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
2313        if file_len < HEADER_BYTES_V1 {
2314            slog_warn!(
2315                "corrupt semantic index (too small: {} bytes), removing",
2316                file_len
2317            );
2318            if !is_worktree_bridge {
2319                let _ = fs::remove_file(&data_path);
2320            }
2321            return None;
2322        }
2323
2324        let bytes = fs::read(&data_path).ok()?;
2325        let version = bytes[0];
2326        if version != SEMANTIC_INDEX_VERSION_V6 {
2327            slog_info!(
2328                "cached semantic index version {} is older than {}, rebuilding",
2329                version,
2330                SEMANTIC_INDEX_VERSION_V6
2331            );
2332            if !is_worktree_bridge {
2333                let _ = fs::remove_file(&data_path);
2334            }
2335            return None;
2336        }
2337        match Self::from_bytes(&bytes, current_canonical_root) {
2338            Ok(index) => {
2339                if index.entries.is_empty() {
2340                    slog_info!("cached semantic index is empty, will rebuild");
2341                    if !is_worktree_bridge {
2342                        let _ = fs::remove_file(&data_path);
2343                    }
2344                    return None;
2345                }
2346                if let Some(expected) = expected_fingerprint {
2347                    let matches = index
2348                        .fingerprint()
2349                        .map(|fingerprint| fingerprint.matches_expected(expected))
2350                        .unwrap_or(false);
2351                    if !matches {
2352                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2353                        if !is_worktree_bridge {
2354                            let _ = fs::remove_file(&data_path);
2355                        }
2356                        return None;
2357                    }
2358                }
2359                slog_info!(
2360                    "loaded semantic index from disk: {} entries",
2361                    index.entries.len()
2362                );
2363                Some(index)
2364            }
2365            Err(e) => {
2366                slog_warn!("corrupt semantic index, rebuilding: {}", e);
2367                if !is_worktree_bridge {
2368                    let _ = fs::remove_file(&data_path);
2369                }
2370                None
2371            }
2372        }
2373    }
2374
2375    /// Serialize the index to bytes for disk persistence
2376    pub fn to_bytes(&self) -> Vec<u8> {
2377        let mut buf = Vec::new();
2378        let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
2379            let encoded = fingerprint.as_string();
2380            if encoded.is_empty() {
2381                None
2382            } else {
2383                Some(encoded.into_bytes())
2384            }
2385        });
2386        let file_mtimes: Vec<_> = self
2387            .file_mtimes
2388            .iter()
2389            .filter_map(|(path, mtime)| {
2390                cache_relative_path(&self.project_root, path)
2391                    .map(|relative| (relative, path, mtime))
2392            })
2393            .collect();
2394        let entries: Vec<_> = self
2395            .entries
2396            .iter()
2397            .filter_map(|entry| {
2398                cache_relative_path(&self.project_root, &entry.chunk.file)
2399                    .map(|relative| (relative, entry))
2400            })
2401            .collect();
2402
2403        // Header: version(1) + dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint
2404        //
2405        // V6 is the single write format. Layout extends V5:
2406        //   - fingerprint is always represented (absent ⇒ fingerprint_len=0,
2407        //     no bytes follow). Uniform format simplifies the reader.
2408        //   - paths are relative to project_root.
2409        //   - file metadata stored as secs(u64) + subsec_nanos(u32) + size(u64) + blake3(32).
2410        //     Preserves full APFS/ext4/NTFS precision and catches mtime ties.
2411        //
2412        // V1/V2 remain readable for backward compatibility (see from_bytes).
2413        // V3/V4 load as compatible formats but are rejected on disk so snippets
2414        // and file sizes are rebuilt once.
2415        let version = SEMANTIC_INDEX_VERSION_V6;
2416        buf.push(version);
2417        buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
2418        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
2419        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
2420        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
2421        buf.extend_from_slice(fp_bytes_ref);
2422
2423        // File mtime table: count(4) + entries
2424        // V3 layout per entry: path_len(4) + path + secs(8) + subsec_nanos(4)
2425        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
2426        for (relative, path, mtime) in &file_mtimes {
2427            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
2428            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
2429            buf.extend_from_slice(&path_bytes);
2430            let duration = mtime
2431                .duration_since(SystemTime::UNIX_EPOCH)
2432                .unwrap_or_default();
2433            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
2434            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
2435            let size = self.file_sizes.get(*path).copied().unwrap_or_default();
2436            buf.extend_from_slice(&size.to_le_bytes());
2437            let hash = self
2438                .file_hashes
2439                .get(*path)
2440                .copied()
2441                .unwrap_or_else(cache_freshness::zero_hash);
2442            buf.extend_from_slice(hash.as_bytes());
2443        }
2444
2445        // Entries: each is metadata + vector
2446        for (relative, entry) in &entries {
2447            let c = &entry.chunk;
2448
2449            // File path
2450            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
2451            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
2452            buf.extend_from_slice(&file_bytes);
2453
2454            // Name
2455            let name_bytes = c.name.as_bytes();
2456            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
2457            buf.extend_from_slice(name_bytes);
2458
2459            // Kind (1 byte)
2460            buf.push(symbol_kind_to_u8(&c.kind));
2461
2462            // Lines + exported
2463            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
2464            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
2465            buf.push(c.exported as u8);
2466
2467            // Snippet
2468            let snippet_bytes = c.snippet.as_bytes();
2469            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
2470            buf.extend_from_slice(snippet_bytes);
2471
2472            // Embed text
2473            let embed_bytes = c.embed_text.as_bytes();
2474            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
2475            buf.extend_from_slice(embed_bytes);
2476
2477            // Vector (f32 array)
2478            for &val in &entry.vector {
2479                buf.extend_from_slice(&val.to_le_bytes());
2480            }
2481        }
2482
2483        buf
2484    }
2485
    /// Deserialize the index from bytes
    ///
    /// Accepts format versions V1 through V6. For V6, every stored path is
    /// resolved against `current_canonical_root` via `cached_path_under_root`
    /// and decoding fails if that returns `None`; older versions use the
    /// stored paths verbatim. `current_canonical_root` must be absolute
    /// (checked by `debug_assert!`) and becomes the index's `project_root`.
    ///
    /// Bytes remaining after the last entry are not inspected.
    ///
    /// # Errors
    ///
    /// Returns a descriptive message when the version is unsupported, the
    /// dimension or a count exceeds its bound, the data ends early, a stored
    /// mtime is out of range, the fingerprint does not parse as a
    /// `SemanticIndexFingerprint`, or an entry refers to a file that has no
    /// record in the file metadata table.
    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
        debug_assert!(current_canonical_root.is_absolute());
        // Read cursor into `data`; every read below advances it.
        let mut pos = 0;

        if data.len() < HEADER_BYTES_V1 {
            return Err("data too short".to_string());
        }

        let version = data[pos];
        pos += 1;
        if version != SEMANTIC_INDEX_VERSION_V1
            && version != SEMANTIC_INDEX_VERSION_V2
            && version != SEMANTIC_INDEX_VERSION_V3
            && version != SEMANTIC_INDEX_VERSION_V4
            && version != SEMANTIC_INDEX_VERSION_V5
            && version != SEMANTIC_INDEX_VERSION_V6
        {
            return Err(format!("unsupported version: {}", version));
        }
        // V2 and newer share the same header layout (V3..V6 only differ from
        // V2 in the per-file record layout): version(1) + dimension(4) +
        // entry_count(4) + fingerprint_len(4) + fingerprint bytes.
        if (version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6)
            && data.len() < HEADER_BYTES_V2
        {
            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
        }

        let dimension = read_u32(data, &mut pos)? as usize;
        let entry_count = read_u32(data, &mut pos)? as usize;
        validate_embedding_dimension(dimension)?;
        if entry_count > MAX_ENTRIES {
            return Err(format!("too many semantic index entries: {}", entry_count));
        }

        // Fingerprint handling:
        //   - V1: no fingerprint field at all.
        //   - V2: fingerprint_len + fingerprint bytes; always present (writer
        //     only emitted V2 when fingerprint was Some).
        //   - V3+: fingerprint_len always present; fingerprint_len==0 ⇒ None.
        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
            || version == SEMANTIC_INDEX_VERSION_V3
            || version == SEMANTIC_INDEX_VERSION_V4
            || version == SEMANTIC_INDEX_VERSION_V5
            || version == SEMANTIC_INDEX_VERSION_V6;
        let fingerprint = if has_fingerprint_field {
            let fingerprint_len = read_u32(data, &mut pos)? as usize;
            if pos + fingerprint_len > data.len() {
                return Err("unexpected end of data reading fingerprint".to_string());
            }
            if fingerprint_len == 0 {
                None
            } else {
                // The fingerprint is stored as JSON text; invalid UTF-8 is
                // replaced lossily and then left to the JSON parser to reject.
                let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
                pos += fingerprint_len;
                Some(
                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
                )
            }
        } else {
            None
        };

        // File mtimes
        let mtime_count = read_u32(data, &mut pos)? as usize;
        if mtime_count > MAX_ENTRIES {
            return Err(format!("too many semantic file mtimes: {}", mtime_count));
        }

        // Sanity check before any allocation sized by header fields: the
        // vectors alone (entry_count * dimension * 4 bytes) must fit in the
        // bytes that remain after the header.
        let vector_bytes = entry_count
            .checked_mul(dimension)
            .and_then(|count| count.checked_mul(F32_BYTES))
            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
        if vector_bytes > data.len().saturating_sub(pos) {
            return Err("semantic index vectors exceed available data".to_string());
        }

        // The three maps are keyed by the same path and filled in lockstep.
        let mut file_mtimes = HashMap::with_capacity(mtime_count);
        let mut file_sizes = HashMap::with_capacity(mtime_count);
        let mut file_hashes = HashMap::with_capacity(mtime_count);
        for _ in 0..mtime_count {
            let path = read_string(data, &mut pos)?;
            let secs = read_u64(data, &mut pos)?;
            // V3+ persists subsec_nanos alongside secs so staleness checks
            // survive restart round-trips. V1/V2 load with 0 nanos, which
            // causes one rebuild on upgrade (they never matched live APFS
            // mtimes anyway — the bug v0.15.2 fixes).
            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
                || version == SEMANTIC_INDEX_VERSION_V4
                || version == SEMANTIC_INDEX_VERSION_V5
                || version == SEMANTIC_INDEX_VERSION_V6
            {
                read_u32(data, &mut pos)?
            } else {
                0
            };
            // File size is stored from V5 on; older records load as 0.
            let size =
                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
                    read_u64(data, &mut pos)?
                } else {
                    0
                };
            // The 32-byte content hash is stored from V6 on; older records
            // load as the zero hash.
            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
                if pos + 32 > data.len() {
                    return Err("unexpected end of data reading content hash".to_string());
                }
                let mut hash_bytes = [0u8; 32];
                hash_bytes.copy_from_slice(&data[pos..pos + 32]);
                pos += 32;
                blake3::Hash::from_bytes(hash_bytes)
            } else {
                cache_freshness::zero_hash()
            };
            // Hardening against corrupt / maliciously crafted cache files
            // (v0.15.2). `Duration::new(secs, nanos)` can panic when the
            // nanosecond carry overflows the second counter, and
            // `SystemTime + Duration` can panic on carry past the platform's
            // upper bound. Explicit validation keeps a corrupted semantic.bin
            // from taking down the whole aft process.
            if nanos >= 1_000_000_000 {
                return Err(format!(
                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
                    nanos
                ));
            }
            let duration = std::time::Duration::new(secs, nanos);
            let mtime = SystemTime::UNIX_EPOCH
                .checked_add(duration)
                .ok_or_else(|| {
                    format!(
                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
                        secs, nanos
                    )
                })?;
            // V6 stores paths relative to the project root; re-anchor them on
            // the current root and reject any that `cached_path_under_root`
            // refuses. Older versions keep the stored path as-is.
            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
            } else {
                PathBuf::from(path)
            };
            file_mtimes.insert(path.clone(), mtime);
            file_sizes.insert(path.clone(), size);
            file_hashes.insert(path, content_hash);
        }

        // Entries
        let mut entries = Vec::with_capacity(entry_count);
        for _ in 0..entry_count {
            let raw_file = PathBuf::from(read_string(data, &mut pos)?);
            // Same path handling as the file table above.
            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
                cached_path_under_root(current_canonical_root, &raw_file)
                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
            } else {
                raw_file
            };
            let name = read_string(data, &mut pos)?;

            if pos >= data.len() {
                return Err("unexpected end of data".to_string());
            }
            // One-byte kind tag; `u8_to_symbol_kind` decides how unknown tags
            // are mapped, they are not rejected here.
            let kind = u8_to_symbol_kind(data[pos]);
            pos += 1;

            let start_line = read_u32(data, &mut pos)?;
            let end_line = read_u32(data, &mut pos)?;

            if pos >= data.len() {
                return Err("unexpected end of data".to_string());
            }
            // Any non-zero byte counts as exported.
            let exported = data[pos] != 0;
            pos += 1;

            let snippet = read_string(data, &mut pos)?;
            let embed_text = read_string(data, &mut pos)?;

            // Vector: exactly `dimension` little-endian f32 values, with no
            // length prefix.
            let vec_bytes = dimension
                .checked_mul(F32_BYTES)
                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
            if pos + vec_bytes > data.len() {
                return Err("unexpected end of data reading vector".to_string());
            }
            let mut vector = Vec::with_capacity(dimension);
            for _ in 0..dimension {
                let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
                vector.push(f32::from_le_bytes(bytes));
                pos += 4;
            }

            entries.push(EmbeddingEntry {
                chunk: SemanticChunk {
                    file,
                    name,
                    kind,
                    start_line,
                    end_line,
                    exported,
                    embed_text,
                    snippet,
                },
                vector,
            });
        }

        // Defensive: the loop above either pushes exactly `entry_count`
        // entries or returns early with an error.
        if entries.len() != entry_count {
            return Err(format!(
                "semantic cache entry count drift: header={} decoded={}",
                entry_count,
                entries.len()
            ));
        }
        // Every entry must belong to a file that has a metadata record.
        for entry in &entries {
            if !file_mtimes.contains_key(&entry.chunk.file) {
                return Err(format!(
                    "semantic cache metadata missing for entry file {}",
                    entry.chunk.file.display()
                ));
            }
        }

        Ok(Self {
            entries,
            file_mtimes,
            file_sizes,
            file_hashes,
            dimension,
            fingerprint,
            project_root: current_canonical_root.to_path_buf(),
            deferred_files: HashSet::new(),
        })
    }
2724}
2725
2726/// Build enriched embedding text from a symbol with cAST-style context
2727fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2728    let relative = file
2729        .strip_prefix(project_root)
2730        .unwrap_or(file)
2731        .to_string_lossy();
2732
2733    let kind_label = match &symbol.kind {
2734        SymbolKind::Function => "function",
2735        SymbolKind::Class => "class",
2736        SymbolKind::Method => "method",
2737        SymbolKind::Struct => "struct",
2738        SymbolKind::Interface => "interface",
2739        SymbolKind::Enum => "enum",
2740        SymbolKind::TypeAlias => "type",
2741        SymbolKind::Variable => "variable",
2742        SymbolKind::Heading => "heading",
2743        SymbolKind::FileSummary => "file-summary",
2744    };
2745
2746    // Build: "file:relative/path kind:function name:validateAuth signature:fn validateAuth(token: &str) -> bool"
2747    let name = &symbol.name;
2748    let mut text = format!(
2749        "name:{name} file:{} kind:{} name:{name}",
2750        relative, kind_label
2751    );
2752
2753    if let Some(sig) = &symbol.signature {
2754        // Cap the signature: structured parsers (e.g. YAML/Kubernetes) pack
2755        // entire inline scripts (CronJob/Job `command:` bodies, multi-KB) into
2756        // the signature. Appending it unbounded produces a single embed_text
2757        // that overflows the embedding backend's physical batch (e.g. a
2758        // llama.cpp server's 512-token cap), aborting the whole index build
2759        // and silently degrading every search to lexical. 400 chars keeps the
2760        // identifying head of the signature without blowing the budget.
2761        text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
2762    }
2763
2764    // Add body snippet (first ~300 chars of symbol body)
2765    let lines: Vec<&str> = source.lines().collect();
2766    let start = (symbol.range.start_line as usize).min(lines.len());
2767    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2768    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2769    if start < end {
2770        let body: String = lines[start..end]
2771            .iter()
2772            .take(15) // max 15 lines
2773            .copied()
2774            .collect::<Vec<&str>>()
2775            .join("\n");
2776        let snippet = if body.len() > 300 {
2777            format!("{}...", &body[..body.floor_char_boundary(300)])
2778        } else {
2779            body
2780        };
2781        text.push_str(&format!(" body:{}", snippet));
2782    }
2783
2784    // Final defense-in-depth clamp: no single embed_text may exceed the
2785    // backend's per-input budget regardless of which field grew. Most
2786    // backends cap a physical batch around 512 tokens; ~1600 chars stays
2787    // comfortably under that for typical English/code (≈4 chars/token).
2788    truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
2789}
2790
/// Upper bound on characters in a single chunk's `embed_text`. Keeps any one
/// input below typical embedding-backend physical batch limits (~512 tokens)
/// so an oversized symbol cannot abort the whole index build.
///
/// The limit is applied through `truncate_chars`, so it counts `char`s
/// (Unicode scalar values) rather than bytes.
const MAX_EMBED_TEXT_CHARS: usize = 1600;
2795
/// Return the first `max_chars` characters of `value` as an owned string.
///
/// Counts `char`s, not bytes, so a multi-byte character is never split.
/// A `value` with `max_chars` or fewer characters is returned whole.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte offset of the character at index `max_chars` is exactly where
    // the kept prefix ends; if there is no such character, keep everything.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
2799
2800fn first_leading_doc_comment(source: &str) -> String {
2801    let lines: Vec<&str> = source.lines().collect();
2802    let Some((start, first)) = lines
2803        .iter()
2804        .enumerate()
2805        .find(|(_, line)| !line.trim().is_empty())
2806    else {
2807        return String::new();
2808    };
2809
2810    let trimmed = first.trim_start();
2811    if trimmed.starts_with("/**") {
2812        let mut comment = Vec::new();
2813        for line in lines.iter().skip(start) {
2814            comment.push(*line);
2815            if line.contains("*/") {
2816                break;
2817            }
2818        }
2819        return truncate_chars(&comment.join("\n"), 200);
2820    }
2821
2822    if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2823        let comment = lines
2824            .iter()
2825            .skip(start)
2826            .take_while(|line| {
2827                let trimmed = line.trim_start();
2828                trimmed.starts_with("///") || trimmed.starts_with("//!")
2829            })
2830            .copied()
2831            .collect::<Vec<_>>()
2832            .join("\n");
2833        return truncate_chars(&comment, 200);
2834    }
2835
2836    String::new()
2837}
2838
2839pub fn build_file_summary_chunk(
2840    file: &Path,
2841    project_root: &Path,
2842    source: &str,
2843    top_exports: &[&str],
2844    top_export_signatures: &[Option<&str>],
2845) -> SemanticChunk {
2846    let relative = file.strip_prefix(project_root).unwrap_or(file);
2847    let rel_path = relative.to_string_lossy();
2848    let parent_dir = relative
2849        .parent()
2850        .map(|parent| parent.to_string_lossy().to_string())
2851        .unwrap_or_default();
2852    let name = file
2853        .file_stem()
2854        .map(|stem| stem.to_string_lossy().to_string())
2855        .unwrap_or_default();
2856    let doc = first_leading_doc_comment(source);
2857    let exports = top_exports
2858        .iter()
2859        .take(5)
2860        .copied()
2861        .collect::<Vec<_>>()
2862        .join(",");
2863    let snippet = if doc.is_empty() {
2864        top_export_signatures
2865            .first()
2866            .and_then(|signature| signature.as_deref())
2867            .map(|signature| truncate_chars(signature, 200))
2868            .unwrap_or_default()
2869    } else {
2870        doc.clone()
2871    };
2872
2873    SemanticChunk {
2874        file: file.to_path_buf(),
2875        name,
2876        kind: SymbolKind::FileSummary,
2877        start_line: 0,
2878        end_line: 0,
2879        exported: false,
2880        embed_text: truncate_chars(
2881            &format!(
2882                "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2883                file.file_stem()
2884                    .map(|stem| stem.to_string_lossy().to_string())
2885                    .unwrap_or_default()
2886            ),
2887            MAX_EMBED_TEXT_CHARS,
2888        ),
2889        snippet,
2890    }
2891}
2892
2893fn parser_for(
2894    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2895    lang: crate::parser::LangId,
2896) -> Result<&mut Parser, String> {
2897    use std::collections::hash_map::Entry;
2898
2899    match parsers.entry(lang) {
2900        Entry::Occupied(entry) => Ok(entry.into_mut()),
2901        Entry::Vacant(entry) => {
2902            let grammar = grammar_for(lang);
2903            let mut parser = Parser::new();
2904            parser
2905                .set_language(&grammar)
2906                .map_err(|error| error.to_string())?;
2907            Ok(entry.insert(parser))
2908        }
2909    }
2910}
2911
/// Report whether `path` has one of the file extensions covered by the
/// semantic index.
///
/// The comparison is case-sensitive. Paths without an extension, or with an
/// extension that is not valid UTF-8, are not indexed.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .map_or(false, |extension| INDEXED_EXTENSIONS.contains(&extension))
}
2944
2945fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2946    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2947    let mtime = metadata.modified().map_err(|error| error.to_string())?;
2948    let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2949        .map_err(|error| error.to_string())?
2950        .unwrap_or_else(cache_freshness::zero_hash);
2951    Ok(IndexedFileMetadata {
2952        mtime,
2953        size: metadata.len(),
2954        content_hash,
2955    })
2956}
2957
/// Canonicalize `path`, tolerating a final component that does not exist.
///
/// If `path` itself resolves, its canonical form is returned. Otherwise the
/// parent directory is canonicalized and the original file name is appended.
/// When neither works (no parent, no file name, or an unresolvable parent),
/// `path` is returned unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| match (path.parent(), path.file_name()) {
        (Some(dir), Some(leaf)) => match fs::canonicalize(dir) {
            Ok(resolved_dir) => resolved_dir.join(leaf),
            Err(_) => path.to_path_buf(),
        },
        _ => path.to_path_buf(),
    })
}
2974
2975fn collect_file_chunks(
2976    project_root: &Path,
2977    file: &Path,
2978    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2979) -> Result<Vec<SemanticChunk>, String> {
2980    if !is_semantic_indexed_extension(file) {
2981        return Err("unsupported file extension".to_string());
2982    }
2983    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2984    let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2985    let tree = parser_for(parsers, lang)?
2986        .parse(&source, None)
2987        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2988    let symbols =
2989        extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2990
2991    Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2992}
2993
2994/// Build a display snippet from a symbol's source
2995fn build_snippet(symbol: &Symbol, source: &str) -> String {
2996    let lines: Vec<&str> = source.lines().collect();
2997    let start = (symbol.range.start_line as usize).min(lines.len());
2998    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2999    let end = (symbol.range.end_line as usize + 1).min(lines.len());
3000    if start < end {
3001        let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
3002        let mut snippet = snippet_lines.join("\n");
3003        if end - start > 5 {
3004            snippet.push_str("\n  ...");
3005        }
3006        if snippet.len() > 300 {
3007            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
3008        }
3009        snippet
3010    } else {
3011        String::new()
3012    }
3013}
3014
3015/// Convert symbols to semantic chunks with enriched context
3016fn symbols_to_chunks(
3017    file: &Path,
3018    symbols: &[Symbol],
3019    source: &str,
3020    project_root: &Path,
3021) -> Vec<SemanticChunk> {
3022    let mut chunks = Vec::new();
3023    let top_exports_with_signatures = symbols
3024        .iter()
3025        .filter(|symbol| {
3026            symbol.exported
3027                && symbol.parent.is_none()
3028                && !matches!(symbol.kind, SymbolKind::Heading)
3029        })
3030        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
3031        .collect::<Vec<_>>();
3032
3033    let has_only_headings = !symbols.is_empty()
3034        && symbols
3035            .iter()
3036            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
3037    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
3038        let top_exports = top_exports_with_signatures
3039            .iter()
3040            .map(|(name, _)| *name)
3041            .collect::<Vec<_>>();
3042        let top_export_signatures = top_exports_with_signatures
3043            .iter()
3044            .map(|(_, signature)| *signature)
3045            .collect::<Vec<_>>();
3046        chunks.push(build_file_summary_chunk(
3047            file,
3048            project_root,
3049            source,
3050            &top_exports,
3051            &top_export_signatures,
3052        ));
3053    }
3054
3055    for symbol in symbols {
3056        // Skip Markdown / HTML heading chunks: empirically they dominate result
3057        // lists even for code-shaped queries because heading prose embeds well.
3058        // Agents querying for code lose the actual matches under doc noise.
3059        // README/docs queries are still served by grep on the same files.
3060        if matches!(symbol.kind, SymbolKind::Heading) {
3061            continue;
3062        }
3063
3064        // Skip very small symbols (single-line variables, etc.)
3065        let line_count = symbol
3066            .range
3067            .end_line
3068            .saturating_sub(symbol.range.start_line)
3069            + 1;
3070        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
3071            continue;
3072        }
3073
3074        let embed_text = build_embed_text(symbol, source, file, project_root);
3075        let snippet = build_snippet(symbol, source);
3076
3077        chunks.push(SemanticChunk {
3078            file: file.to_path_buf(),
3079            name: symbol.name.clone(),
3080            kind: symbol.kind.clone(),
3081            start_line: symbol.range.start_line,
3082            end_line: symbol.range.end_line,
3083            exported: symbol.exported,
3084            embed_text,
3085            snippet,
3086        });
3087
3088        // Note: Nested symbols are handled separately by the outline system
3089        // Each symbol is indexed individually
3090    }
3091
3092    chunks
3093}
3094
/// Cosine similarity between two vectors
///
/// Returns `0.0` when the slices differ in length or when the product of
/// their norms is zero (which includes the empty case).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Single left-to-right pass accumulating the dot product and both
    // squared norms.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
3118
3119// Serialization helpers
3120fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
3121    match kind {
3122        SymbolKind::Function => 0,
3123        SymbolKind::Class => 1,
3124        SymbolKind::Method => 2,
3125        SymbolKind::Struct => 3,
3126        SymbolKind::Interface => 4,
3127        SymbolKind::Enum => 5,
3128        SymbolKind::TypeAlias => 6,
3129        SymbolKind::Variable => 7,
3130        SymbolKind::Heading => 8,
3131        SymbolKind::FileSummary => 9,
3132    }
3133}
3134
3135fn u8_to_symbol_kind(v: u8) -> SymbolKind {
3136    match v {
3137        0 => SymbolKind::Function,
3138        1 => SymbolKind::Class,
3139        2 => SymbolKind::Method,
3140        3 => SymbolKind::Struct,
3141        4 => SymbolKind::Interface,
3142        5 => SymbolKind::Enum,
3143        6 => SymbolKind::TypeAlias,
3144        7 => SymbolKind::Variable,
3145        8 => SymbolKind::Heading,
3146        9 => SymbolKind::FileSummary,
3147        _ => SymbolKind::Heading,
3148    }
3149}
3150
/// Reads a little-endian `u32` from `data` starting at `*pos`.
///
/// On success `*pos` is advanced by four bytes.
///
/// # Errors
///
/// Returns `"unexpected end of data reading u32"` and leaves `*pos` unchanged
/// when fewer than four bytes are available at `*pos`. This includes a cursor
/// that already lies beyond the buffer or is so large that adding four would
/// overflow.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    let start = *pos;
    // `checked_add` + `get` turn every out-of-range cursor into an error
    // instead of an arithmetic-overflow or slice-index panic.
    let bytes = start
        .checked_add(4)
        .and_then(|end| data.get(start..end))
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;

    let mut raw = [0u8; 4];
    raw.copy_from_slice(bytes);
    *pos = start + raw.len();
    Ok(u32::from_le_bytes(raw))
}
3159
/// Reads a little-endian `u64` from `data` starting at `*pos`.
///
/// On success `*pos` is advanced by eight bytes.
///
/// # Errors
///
/// Returns `"unexpected end of data reading u64"` and leaves `*pos` unchanged
/// when fewer than eight bytes are available at `*pos`. This includes a cursor
/// that already lies beyond the buffer or is so large that adding eight would
/// overflow.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    let start = *pos;
    // Same shape as `read_u32`: no unchecked addition, no `unwrap`, no
    // direct indexing, so malformed input can only produce an `Err`.
    let bytes = start
        .checked_add(8)
        .and_then(|end| data.get(start..end))
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;

    let mut raw = [0u8; 8];
    raw.copy_from_slice(bytes);
    *pos = start + raw.len();
    Ok(u64::from_le_bytes(raw))
}
3168
3169fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
3170    let len = read_u32(data, pos)? as usize;
3171    if *pos + len > data.len() {
3172        return Err("unexpected end of data reading string".to_string());
3173    }
3174    let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
3175    *pos += len;
3176    Ok(s)
3177}
3178
3179#[cfg(test)]
3180mod tests {
3181    use super::*;
3182    use crate::config::{SemanticBackend, SemanticBackendConfig};
3183    use crate::parser::FileParser;
3184    use std::io::{Read, Write};
3185    use std::net::TcpListener;
3186    use std::thread;
3187
3188    #[test]
3189    fn semantic_index_includes_php_inc_and_scss_extensions() {
3190        for file in ["partial.inc", "index.php", "styles.scss"] {
3191            assert!(
3192                is_semantic_indexed_extension(Path::new(file)),
3193                "{file} should be semantic-index eligible"
3194            );
3195        }
3196    }
3197
3198    #[test]
3199    fn transient_marker_round_trips_and_classifies() {
3200        // A marked transient error is recognized and the marker is stripped for
3201        // display, leaving a clean message.
3202        let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
3203        assert!(embedding_failure_is_transient(&marked));
3204        let clean = strip_transient_embedding_marker(&marked);
3205        assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
3206        assert!(clean.starts_with("openai compatible request failed:"));
3207
3208        // Permanent errors (HTTP 4xx, dimension mismatch) carry no marker and
3209        // are not classified transient — they must fail fast.
3210        for permanent in [
3211            "openai compatible request failed (HTTP 401): Unauthorized",
3212            "embedding dimension mismatch: index has 384, model returned 768",
3213            "too many files (>20000) for semantic indexing (max 20000)",
3214        ] {
3215            assert!(
3216                !embedding_failure_is_transient(permanent),
3217                "{permanent:?} must not be transient"
3218            );
3219            // Stripping a marker-free string is a no-op.
3220            assert_eq!(strip_transient_embedding_marker(permanent), permanent);
3221        }
3222    }
3223
3224    #[test]
3225    fn send_error_transience_separates_connect_timeout_from_4xx() {
3226        // 5xx / 429 are transient; other client errors are not.
3227        assert!(is_retryable_embedding_status(
3228            reqwest::StatusCode::INTERNAL_SERVER_ERROR
3229        ));
3230        assert!(is_retryable_embedding_status(
3231            reqwest::StatusCode::TOO_MANY_REQUESTS
3232        ));
3233        assert!(!is_retryable_embedding_status(
3234            reqwest::StatusCode::UNAUTHORIZED
3235        ));
3236        assert!(!is_retryable_embedding_status(
3237            reqwest::StatusCode::BAD_REQUEST
3238        ));
3239    }
3240
3241    #[test]
3242    fn local_backend_model_loading_body_is_transient() {
3243        // LM Studio / Ollama return a 4xx with a loading/unloaded message while
3244        // the model swaps; these must classify transient so the build self-heals.
3245        for body in [
3246            r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
3247            r#"{"error":"model is loading, please wait"}"#,
3248            r#"{"error":"Model not loaded"}"#,
3249            "Loading model into memory",
3250        ] {
3251            assert!(
3252                embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3253                "{body:?} should be body-transient"
3254            );
3255        }
3256
3257        // A genuine 4xx misconfiguration body must NOT be treated as transient,
3258        // even when it happens to contain generic words from the old broad
3259        // substring matcher.
3260        for body in [
3261            r#"{"error":"invalid api key"}"#,
3262            r#"{"error":"model 'foo' not found"}"#,
3263            "Bad Request: unknown field",
3264            "Bad Request: invalid loading model option",
3265            r#"{"error":"unauthorized while model is being loaded by another account"}"#,
3266        ] {
3267            assert!(
3268                !embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3269                "{body:?} must not be body-transient"
3270            );
3271        }
3272
3273        assert!(
3274            !embedding_response_body_is_transient(
3275                reqwest::StatusCode::UNAUTHORIZED,
3276                r#"{"error":"model is loading, please wait"}"#
3277            ),
3278            "permanent auth failures must not become transient because of body text"
3279        );
3280    }
3281
3282    fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
3283    where
3284        F: Fn(String, String, String) -> String + Send + 'static,
3285    {
3286        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3287        let addr = listener.local_addr().expect("local addr");
3288        let handle = thread::spawn(move || {
3289            let (mut stream, _) = listener.accept().expect("accept request");
3290            let mut buf = Vec::new();
3291            let mut chunk = [0u8; 4096];
3292            let mut header_end = None;
3293            let mut content_length = 0usize;
3294            loop {
3295                let n = stream.read(&mut chunk).expect("read request");
3296                if n == 0 {
3297                    break;
3298                }
3299                buf.extend_from_slice(&chunk[..n]);
3300                if header_end.is_none() {
3301                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3302                        header_end = Some(pos + 4);
3303                        let headers = String::from_utf8_lossy(&buf[..pos + 4]);
3304                        for line in headers.lines() {
3305                            if let Some(value) = line.strip_prefix("Content-Length:") {
3306                                content_length = value.trim().parse::<usize>().unwrap_or(0);
3307                            }
3308                        }
3309                    }
3310                }
3311                if let Some(end) = header_end {
3312                    if buf.len() >= end + content_length {
3313                        break;
3314                    }
3315                }
3316            }
3317
3318            let end = header_end.expect("header terminator");
3319            let request = String::from_utf8_lossy(&buf[..end]).to_string();
3320            let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3321            let mut lines = request.lines();
3322            let request_line = lines.next().expect("request line").to_string();
3323            let path = request_line
3324                .split_whitespace()
3325                .nth(1)
3326                .expect("request path")
3327                .to_string();
3328            let response_body = handler(request_line, path, body);
3329            let response = format!(
3330                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3331                response_body.len(),
3332                response_body
3333            );
3334            stream
3335                .write_all(response.as_bytes())
3336                .expect("write response");
3337        });
3338
3339        (format!("http://{}", addr), handle)
3340    }
3341
3342    fn start_truncated_body_server(attempts: usize) -> (String, thread::JoinHandle<()>) {
3343        let listener = TcpListener::bind("127.0.0.1:0").expect("bind truncated test server");
3344        listener
3345            .set_nonblocking(true)
3346            .expect("nonblocking listener");
3347        let addr = listener.local_addr().expect("local addr");
3348        let handle = thread::spawn(move || {
3349            let deadline = std::time::Instant::now() + Duration::from_secs(2);
3350            let mut accepted = 0usize;
3351            while accepted < attempts && std::time::Instant::now() < deadline {
3352                match listener.accept() {
3353                    Ok((mut stream, _)) => {
3354                        accepted += 1;
3355                        let mut buf = [0u8; 4096];
3356                        // The client (under test) uses a 250ms timeout and drops
3357                        // the connection when the truncated body never completes.
3358                        // On Windows that disconnect surfaces as a hard socket
3359                        // error (WSAECONNRESET) on these read/write calls, where
3360                        // Unix returns a clean EOF. Tolerate both: the mock does
3361                        // not need the request bytes, and a write to an
3362                        // already-hung-up client is expected.
3363                        let _ = stream.read(&mut buf);
3364                        let response = "HTTP/1.1 200 OK
3365Content-Type: application/json
3366Content-Length: 128
3367Connection: close
3368
3369{";
3370                        let _ = stream.write_all(response.as_bytes());
3371                    }
3372                    Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
3373                        thread::sleep(Duration::from_millis(10));
3374                    }
3375                    Err(error) => panic!("accept request: {error}"),
3376                }
3377            }
3378        });
3379
3380        (format!("http://{}", addr), handle)
3381    }
3382
3383    #[test]
3384    fn response_body_read_failures_are_marked_transient() {
3385        let (url, handle) = start_truncated_body_server(EMBEDDING_REQUEST_MAX_ATTEMPTS);
3386        let client = Client::builder()
3387            .timeout(Duration::from_millis(250))
3388            .build()
3389            .expect("client");
3390
3391        let error = send_embedding_request(|| client.post(&url).body("{}"), "test backend")
3392            .expect_err("truncated body should fail");
3393
3394        handle.join().unwrap();
3395        assert!(
3396            embedding_failure_is_transient(&error),
3397            "body read failures should be transient-marked: {error}"
3398        );
3399        assert!(error.contains("response read failed"));
3400    }
3401
3402    fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3403        Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
3404    }
3405
3406    fn write_rust_file(path: &Path, function_name: &str) {
3407        fs::write(
3408            path,
3409            format!("pub fn {function_name}() -> bool {{\n    true\n}}\n"),
3410        )
3411        .unwrap();
3412    }
3413
3414    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3415        let mut embed = test_vector_for_texts;
3416        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3417    }
3418
3419    fn test_project_root() -> PathBuf {
3420        std::env::current_dir().unwrap()
3421    }
3422
3423    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3424        index.file_mtimes.insert(file.to_path_buf(), mtime);
3425        index.file_sizes.insert(file.to_path_buf(), size);
3426        index
3427            .file_hashes
3428            .insert(file.to_path_buf(), cache_freshness::zero_hash());
3429    }
3430
3431    #[derive(Default)]
3432    struct RecordingEmbedder {
3433        calls: Vec<Vec<String>>,
3434    }
3435
3436    impl RecordingEmbedder {
3437        fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3438            let vectors = texts
3439                .iter()
3440                .map(|text| deterministic_test_vector(text))
3441                .collect();
3442            self.calls.push(texts);
3443            Ok(vectors)
3444        }
3445
3446        fn total_embedded_texts(&self) -> usize {
3447            self.calls.iter().map(Vec::len).sum()
3448        }
3449
3450        fn embedded_texts(&self) -> Vec<&str> {
3451            self.calls
3452                .iter()
3453                .flat_map(|batch| batch.iter().map(String::as_str))
3454                .collect()
3455        }
3456    }
3457
3458    fn deterministic_test_vector(text: &str) -> Vec<f32> {
3459        let hash = blake3::hash(text.as_bytes());
3460        let bytes = hash.as_bytes();
3461        vec![
3462            1.0,
3463            bytes[0] as f32 / 255.0,
3464            bytes[1] as f32 / 255.0,
3465            bytes[2] as f32 / 255.0,
3466        ]
3467    }
3468
3469    fn build_recorded_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3470        let mut embedder = RecordingEmbedder::default();
3471        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3472        SemanticIndex::build(project_root, files, &mut embed, 16).unwrap()
3473    }
3474
3475    fn force_stale(index: &mut SemanticIndex, file: &Path) {
3476        set_file_metadata(index, file, SystemTime::UNIX_EPOCH, 0);
3477    }
3478
3479    fn write_source(path: &Path, source: &str) {
3480        if let Some(parent) = path.parent() {
3481            fs::create_dir_all(parent).unwrap();
3482        }
3483        fs::write(path, source).unwrap();
3484    }
3485
3486    fn entries_for_file<'a>(index: &'a SemanticIndex, file: &Path) -> Vec<&'a EmbeddingEntry> {
3487        index
3488            .entries
3489            .iter()
3490            .filter(|entry| entry.chunk.file == file)
3491            .collect()
3492    }
3493
3494    fn entry_by_name<'a>(index: &'a SemanticIndex, file: &Path, name: &str) -> &'a EmbeddingEntry {
3495        index
3496            .entries
3497            .iter()
3498            .find(|entry| entry.chunk.file == file && entry.chunk.name == name)
3499            .unwrap_or_else(|| panic!("missing semantic entry {name} in {}", file.display()))
3500    }
3501
3502    fn file_summary_entry<'a>(index: &'a SemanticIndex, file: &Path) -> &'a EmbeddingEntry {
3503        index
3504            .entries
3505            .iter()
3506            .find(|entry| entry.chunk.file == file && entry.chunk.kind == SymbolKind::FileSummary)
3507            .unwrap_or_else(|| panic!("missing file-summary entry in {}", file.display()))
3508    }
3509
3510    #[test]
3511    fn refresh_stale_line_shift_reuses_all_chunks_and_retains_entries() {
3512        let temp = tempfile::tempdir().unwrap();
3513        let project_root = temp.path();
3514        let file = project_root.join("src/lib.rs");
3515        let original = "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n";
3516        write_source(&file, original);
3517
3518        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3519        let original_entry_count = index.entries.len();
3520        let original_alpha_vector = entry_by_name(&index, &file, "alpha").vector.clone();
3521
3522        write_source(&file, &format!("\n{original}"));
3523        force_stale(&mut index, &file);
3524
3525        let mut embedder = RecordingEmbedder::default();
3526        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3527        let mut progress = |_done: usize, _total: usize| {};
3528        let summary = index
3529            .refresh_stale_files(
3530                project_root,
3531                std::slice::from_ref(&file),
3532                &mut embed,
3533                16,
3534                &mut progress,
3535            )
3536            .unwrap();
3537
3538        assert_eq!(summary.changed, 1);
3539        assert_eq!(embedder.total_embedded_texts(), 0);
3540        assert_eq!(index.entries.len(), original_entry_count);
3541        let shifted_alpha = entry_by_name(&index, &file, "alpha");
3542        assert_eq!(shifted_alpha.chunk.start_line, 1);
3543        assert_eq!(shifted_alpha.vector, original_alpha_vector);
3544    }
3545
3546    #[test]
3547    fn refresh_invalidated_line_shift_emits_full_replacement_delta_for_apply() {
3548        let temp = tempfile::tempdir().unwrap();
3549        let project_root = temp.path();
3550        let file = project_root.join("src/lib.rs");
3551        let original = "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n";
3552        write_source(&file, original);
3553
3554        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3555        let mut serving_index = worker_index.clone();
3556        let original_entry_count = worker_index.entries.len();
3557
3558        write_source(&file, &format!("\n{original}"));
3559
3560        let mut embedder = RecordingEmbedder::default();
3561        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3562        let mut progress = |_done: usize, _total: usize| {};
3563        let update = worker_index
3564            .refresh_invalidated_files(
3565                project_root,
3566                std::slice::from_ref(&file),
3567                &mut embed,
3568                16,
3569                100,
3570                &mut progress,
3571            )
3572            .unwrap();
3573
3574        assert_eq!(embedder.total_embedded_texts(), 0);
3575        assert_eq!(update.added_entries.len(), original_entry_count);
3576        assert_eq!(worker_index.entries.len(), original_entry_count);
3577
3578        serving_index.apply_refresh_update(
3579            update.added_entries,
3580            update.updated_metadata,
3581            &update.completed_paths,
3582        );
3583
3584        assert_eq!(serving_index.entries.len(), original_entry_count);
3585        assert_eq!(
3586            entries_for_file(&serving_index, &file).len(),
3587            original_entry_count
3588        );
3589        assert_eq!(
3590            entry_by_name(&serving_index, &file, "alpha")
3591                .chunk
3592                .start_line,
3593            1
3594        );
3595    }
3596
3597    #[test]
3598    fn refresh_invalidated_one_symbol_edit_embeds_only_changed_symbol() {
3599        let temp = tempfile::tempdir().unwrap();
3600        let project_root = temp.path();
3601        let file = project_root.join("src/lib.rs");
3602        write_source(
3603            &file,
3604            "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n",
3605        );
3606
3607        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3608        let original_entry_count = index.entries.len();
3609        let beta_vector = entry_by_name(&index, &file, "beta").vector.clone();
3610
3611        write_source(
3612            &file,
3613            "pub fn alpha() -> i32 {\n    10\n}\n\npub fn beta() -> i32 {\n    2\n}\n",
3614        );
3615
3616        let mut embedder = RecordingEmbedder::default();
3617        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3618        let mut progress = |_done: usize, _total: usize| {};
3619        let update = index
3620            .refresh_invalidated_files(
3621                project_root,
3622                std::slice::from_ref(&file),
3623                &mut embed,
3624                16,
3625                100,
3626                &mut progress,
3627            )
3628            .unwrap();
3629
3630        assert_eq!(embedder.total_embedded_texts(), 1);
3631        assert!(embedder.embedded_texts()[0].contains("name:alpha"));
3632        assert_eq!(update.added_entries.len(), original_entry_count);
3633        assert_eq!(entry_by_name(&index, &file, "beta").vector, beta_vector);
3634    }
3635
3636    #[test]
3637    fn refresh_reuses_one_old_vector_for_two_byte_identical_symbols() {
3638        let temp = tempfile::tempdir().unwrap();
3639        let project_root = temp.path();
3640        let file = project_root.join("src/dupe.js");
3641        let one_duplicate = "function duplicate() {\n  return 1;\n}\n";
3642        write_source(&file, one_duplicate);
3643
3644        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3645        let original_vector = entry_by_name(&index, &file, "duplicate").vector.clone();
3646
3647        write_source(&file, &format!("{one_duplicate}\n{one_duplicate}"));
3648
3649        let mut embedder = RecordingEmbedder::default();
3650        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3651        let mut progress = |_done: usize, _total: usize| {};
3652        index
3653            .refresh_invalidated_files(
3654                project_root,
3655                std::slice::from_ref(&file),
3656                &mut embed,
3657                16,
3658                100,
3659                &mut progress,
3660            )
3661            .unwrap();
3662
3663        let duplicate_entries = index
3664            .entries
3665            .iter()
3666            .filter(|entry| entry.chunk.file == file && entry.chunk.name == "duplicate")
3667            .collect::<Vec<_>>();
3668        assert_eq!(duplicate_entries.len(), 2);
3669        assert_eq!(embedder.total_embedded_texts(), 0);
3670        assert_eq!(duplicate_entries[0].vector, original_vector);
3671        assert_eq!(duplicate_entries[1].vector, original_vector);
3672    }
3673
3674    #[test]
3675    fn file_summary_reuses_on_body_edit_and_misses_on_leading_doc_edit() {
3676        let temp = tempfile::tempdir().unwrap();
3677        let project_root = temp.path();
3678        let file = project_root.join("src/lib.rs");
3679        write_source(
3680            &file,
3681            "//! module docs v1\n\npub fn alpha() -> i32 {\n    1\n}\n",
3682        );
3683
3684        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3685        let summary_before = file_summary_entry(&index, &file).vector.clone();
3686
3687        write_source(
3688            &file,
3689            "//! module docs v1\n\npub fn alpha() -> i32 {\n    2\n}\n",
3690        );
3691        let mut body_embedder = RecordingEmbedder::default();
3692        let mut body_embed = |texts: Vec<String>| body_embedder.embed(texts);
3693        let mut progress = |_done: usize, _total: usize| {};
3694        index
3695            .refresh_invalidated_files(
3696                project_root,
3697                std::slice::from_ref(&file),
3698                &mut body_embed,
3699                16,
3700                100,
3701                &mut progress,
3702            )
3703            .unwrap();
3704        assert_eq!(body_embedder.total_embedded_texts(), 1);
3705        assert!(body_embedder.embedded_texts()[0].contains("name:alpha"));
3706        assert_eq!(file_summary_entry(&index, &file).vector, summary_before);
3707
3708        write_source(
3709            &file,
3710            "//! module docs v2\n\npub fn alpha() -> i32 {\n    2\n}\n",
3711        );
3712        let mut doc_embedder = RecordingEmbedder::default();
3713        let mut doc_embed = |texts: Vec<String>| doc_embedder.embed(texts);
3714        index
3715            .refresh_invalidated_files(
3716                project_root,
3717                std::slice::from_ref(&file),
3718                &mut doc_embed,
3719                16,
3720                100,
3721                &mut progress,
3722            )
3723            .unwrap();
3724
3725        assert_eq!(doc_embedder.total_embedded_texts(), 1);
3726        assert!(doc_embedder.embedded_texts()[0].contains("kind:file-summary"));
3727        assert_ne!(file_summary_entry(&index, &file).vector, summary_before);
3728    }
3729
3730    #[test]
3731    fn refresh_invalidated_deleted_file_drops_entries_without_embedding() {
3732        let temp = tempfile::tempdir().unwrap();
3733        let project_root = temp.path();
3734        let file = project_root.join("src/lib.rs");
3735        write_source(&file, "pub fn alpha() -> i32 {\n    1\n}\n");
3736
3737        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3738        let mut serving_index = worker_index.clone();
3739        fs::remove_file(&file).unwrap();
3740
3741        let mut embedder = RecordingEmbedder::default();
3742        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3743        let mut progress = |_done: usize, _total: usize| {};
3744        let update = worker_index
3745            .refresh_invalidated_files(
3746                project_root,
3747                std::slice::from_ref(&file),
3748                &mut embed,
3749                16,
3750                100,
3751                &mut progress,
3752            )
3753            .unwrap();
3754
3755        assert_eq!(update.summary.deleted, 1);
3756        assert_eq!(embedder.total_embedded_texts(), 0);
3757        assert!(worker_index.entries.is_empty());
3758
3759        serving_index.apply_refresh_update(
3760            update.added_entries,
3761            update.updated_metadata,
3762            &update.completed_paths,
3763        );
3764        assert!(serving_index.entries.is_empty());
3765    }
3766
3767    #[test]
3768    fn watcher_collect_failure_does_not_resurrect_stale_entries() {
3769        let temp = tempfile::tempdir().unwrap();
3770        let project_root = temp.path();
3771        let file = project_root.join("src/lib.rs");
3772        write_source(&file, "pub fn alpha() -> i32 {\n    1\n}\n");
3773
3774        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3775        let mut serving_index = worker_index.clone();
3776        fs::write(&file, [0xff, 0xfe, 0xfd]).unwrap();
3777
3778        let mut embedder = RecordingEmbedder::default();
3779        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3780        let mut progress = |_done: usize, _total: usize| {};
3781        let update = worker_index
3782            .refresh_invalidated_files(
3783                project_root,
3784                std::slice::from_ref(&file),
3785                &mut embed,
3786                16,
3787                100,
3788                &mut progress,
3789            )
3790            .unwrap();
3791
3792        assert_eq!(embedder.total_embedded_texts(), 0);
3793        assert!(update.added_entries.is_empty());
3794        assert!(worker_index.entries.is_empty());
3795        assert!(!worker_index.file_mtimes.contains_key(&file));
3796
3797        serving_index.apply_refresh_update(
3798            update.added_entries,
3799            update.updated_metadata,
3800            &update.completed_paths,
3801        );
3802        assert!(serving_index.entries.is_empty());
3803        assert!(!serving_index.file_mtimes.contains_key(&file));
3804    }
3805
3806    #[test]
3807    fn refresh_invalidated_cap_deferral_remains_file_count_based() {
3808        let temp = tempfile::tempdir().unwrap();
3809        let project_root = temp.path();
3810        let indexed = project_root.join("src/a.rs");
3811        let deferred = project_root.join("src/b.rs");
3812        write_source(&indexed, "pub fn alpha() -> i32 {\n    1\n}\n");
3813        write_source(&deferred, "pub fn beta() -> i32 {\n    2\n}\n");
3814
3815        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&indexed));
3816        let mut embedder = RecordingEmbedder::default();
3817        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3818        let mut progress = |_done: usize, _total: usize| {};
3819        let update = index
3820            .refresh_invalidated_files(
3821                project_root,
3822                std::slice::from_ref(&deferred),
3823                &mut embed,
3824                16,
3825                1,
3826                &mut progress,
3827            )
3828            .unwrap();
3829
3830        assert_eq!(update.summary.total_processed, 1);
3831        assert_eq!(update.summary.added, 0);
3832        assert_eq!(embedder.total_embedded_texts(), 0);
3833        assert_eq!(index.indexed_file_count(), 1);
3834        assert!(index.deferred_files.contains(&deferred));
3835        assert!(entries_for_file(&index, &deferred).is_empty());
3836    }
3837
3838    #[test]
3839    fn semantic_cache_serialization_skips_paths_outside_project_root() {
3840        let dir = tempfile::tempdir().expect("create temp dir");
3841        let project = fs::canonicalize(dir.path()).expect("canonical project");
3842        let outside = project.join("..").join("outside.rs");
3843        let mut index = SemanticIndex::new(project.clone(), 3);
3844        index
3845            .file_mtimes
3846            .insert(outside.clone(), SystemTime::UNIX_EPOCH);
3847        index.file_sizes.insert(outside.clone(), 1);
3848        index
3849            .file_hashes
3850            .insert(outside.clone(), cache_freshness::zero_hash());
3851        index.entries.push(EmbeddingEntry {
3852            chunk: SemanticChunk {
3853                file: outside,
3854                name: "outside".to_string(),
3855                kind: SymbolKind::Function,
3856                start_line: 0,
3857                end_line: 0,
3858                exported: false,
3859                embed_text: "outside".to_string(),
3860                snippet: "outside".to_string(),
3861            },
3862            vector: vec![1.0, 0.0, 0.0],
3863        });
3864
3865        let bytes = index.to_bytes();
3866        let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
3867        assert_eq!(loaded.entries.len(), 0);
3868        assert!(loaded.file_mtimes.is_empty());
3869    }
3870
3871    #[test]
3872    fn test_cosine_similarity_identical() {
3873        let a = vec![1.0, 0.0, 0.0];
3874        let b = vec![1.0, 0.0, 0.0];
3875        assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
3876    }
3877
3878    #[test]
3879    fn test_cosine_similarity_orthogonal() {
3880        let a = vec![1.0, 0.0, 0.0];
3881        let b = vec![0.0, 1.0, 0.0];
3882        assert!(cosine_similarity(&a, &b).abs() < 0.001);
3883    }
3884
3885    #[test]
3886    fn test_cosine_similarity_opposite() {
3887        let a = vec![1.0, 0.0, 0.0];
3888        let b = vec![-1.0, 0.0, 0.0];
3889        assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
3890    }
3891
3892    #[test]
3893    fn test_serialization_roundtrip() {
3894        let project_root = test_project_root();
3895        let file = project_root.join("src/main.rs");
3896        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
3897        index.entries.push(EmbeddingEntry {
3898            chunk: SemanticChunk {
3899                file: file.clone(),
3900                name: "handle_request".to_string(),
3901                kind: SymbolKind::Function,
3902                start_line: 10,
3903                end_line: 25,
3904                exported: true,
3905                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3906                snippet: "fn handle_request() {\n  // ...\n}".to_string(),
3907            },
3908            vector: vec![0.1, 0.2, 0.3, 0.4],
3909        });
3910        index.dimension = 4;
3911        index
3912            .file_mtimes
3913            .insert(file.clone(), SystemTime::UNIX_EPOCH);
3914        index.file_sizes.insert(file, 0);
3915        index.set_fingerprint(SemanticIndexFingerprint {
3916            backend: "fastembed".to_string(),
3917            model: "all-MiniLM-L6-v2".to_string(),
3918            base_url: FALLBACK_BACKEND.to_string(),
3919            dimension: 4,
3920            chunking_version: default_chunking_version(),
3921        });
3922
3923        let bytes = index.to_bytes();
3924        let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
3925
3926        assert_eq!(restored.entries.len(), 1);
3927        assert_eq!(restored.entries[0].chunk.name, "handle_request");
3928        assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
3929        assert_eq!(restored.dimension, 4);
3930        assert_eq!(restored.backend_label(), Some("fastembed"));
3931        assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
3932    }
3933
3934    #[test]
3935    fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
3936        let cases = [
3937            (SymbolKind::Function, 0),
3938            (SymbolKind::Class, 1),
3939            (SymbolKind::Method, 2),
3940            (SymbolKind::Struct, 3),
3941            (SymbolKind::Interface, 4),
3942            (SymbolKind::Enum, 5),
3943            (SymbolKind::TypeAlias, 6),
3944            (SymbolKind::Variable, 7),
3945            (SymbolKind::Heading, 8),
3946            (SymbolKind::FileSummary, 9),
3947        ];
3948
3949        for (kind, encoded) in cases {
3950            assert_eq!(symbol_kind_to_u8(&kind), encoded);
3951            assert_eq!(u8_to_symbol_kind(encoded), kind);
3952        }
3953    }
3954
3955    #[test]
3956    fn test_search_top_k() {
3957        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3958        index.dimension = 3;
3959
3960        // Add entries with known vectors
3961        for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
3962            let mut vec = vec![0.0f32; 3];
3963            vec[i] = 1.0; // orthogonal vectors
3964            index.entries.push(EmbeddingEntry {
3965                chunk: SemanticChunk {
3966                    file: PathBuf::from("/src/lib.rs"),
3967                    name: name.to_string(),
3968                    kind: SymbolKind::Function,
3969                    start_line: (i * 10 + 1) as u32,
3970                    end_line: (i * 10 + 5) as u32,
3971                    exported: true,
3972                    embed_text: format!("kind:function name:{}", name),
3973                    snippet: format!("fn {}() {{}}", name),
3974                },
3975                vector: vec,
3976            });
3977        }
3978
3979        // Query aligned with "auth" (index 0)
3980        let query = vec![0.9, 0.1, 0.0];
3981        let results = index.search(&query, 2);
3982
3983        assert_eq!(results.len(), 2);
3984        assert_eq!(results[0].name, "auth"); // highest score
3985        assert!(results[0].score > results[1].score);
3986    }
3987
3988    #[test]
3989    fn test_empty_index_search() {
3990        let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3991        let results = index.search(&[0.1, 0.2, 0.3], 10);
3992        assert!(results.is_empty());
3993    }
3994
3995    #[test]
3996    fn single_line_symbol_builds_non_empty_snippet() {
3997        let symbol = Symbol {
3998            name: "answer".to_string(),
3999            kind: SymbolKind::Variable,
4000            range: crate::symbols::Range {
4001                start_line: 0,
4002                start_col: 0,
4003                end_line: 0,
4004                end_col: 24,
4005            },
4006            signature: Some("const answer = 42".to_string()),
4007            scope_chain: Vec::new(),
4008            exported: true,
4009            parent: None,
4010        };
4011        let source = "export const answer = 42;\n";
4012
4013        let snippet = build_snippet(&symbol, source);
4014
4015        assert_eq!(snippet, "export const answer = 42;");
4016    }
4017
4018    #[test]
4019    fn optimized_file_chunk_collection_matches_file_parser_path() {
4020        let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
4021        let file = project_root.join("src/semantic_index.rs");
4022        let source = std::fs::read_to_string(&file).unwrap();
4023
4024        let mut legacy_parser = FileParser::new();
4025        let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
4026        let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
4027
4028        let mut parsers = HashMap::new();
4029        let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
4030
4031        assert_eq!(
4032            chunk_fingerprint(&optimized_chunks),
4033            chunk_fingerprint(&legacy_chunks)
4034        );
4035    }
4036
4037    fn chunk_fingerprint(
4038        chunks: &[SemanticChunk],
4039    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
4040        chunks
4041            .iter()
4042            .map(|chunk| {
4043                (
4044                    chunk.name.clone(),
4045                    chunk.kind.clone(),
4046                    chunk.start_line,
4047                    chunk.end_line,
4048                    chunk.exported,
4049                    chunk.embed_text.clone(),
4050                    chunk.snippet.clone(),
4051                )
4052            })
4053            .collect()
4054    }
4055
4056    #[test]
4057    fn rejects_oversized_dimension_during_deserialization() {
4058        let mut bytes = Vec::new();
4059        bytes.push(1u8);
4060        bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
4061        bytes.extend_from_slice(&0u32.to_le_bytes());
4062        bytes.extend_from_slice(&0u32.to_le_bytes());
4063
4064        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4065    }
4066
4067    #[test]
4068    fn rejects_oversized_entry_count_during_deserialization() {
4069        let mut bytes = Vec::new();
4070        bytes.push(1u8);
4071        bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
4072        bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
4073        bytes.extend_from_slice(&0u32.to_le_bytes());
4074
4075        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4076    }
4077
4078    #[test]
4079    fn invalidate_file_removes_entries_and_mtime() {
4080        let target = PathBuf::from("/src/main.rs");
4081        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4082        index.entries.push(EmbeddingEntry {
4083            chunk: SemanticChunk {
4084                file: target.clone(),
4085                name: "main".to_string(),
4086                kind: SymbolKind::Function,
4087                start_line: 0,
4088                end_line: 1,
4089                exported: false,
4090                embed_text: "main".to_string(),
4091                snippet: "fn main() {}".to_string(),
4092            },
4093            vector: vec![1.0; DEFAULT_DIMENSION],
4094        });
4095        index
4096            .file_mtimes
4097            .insert(target.clone(), SystemTime::UNIX_EPOCH);
4098        index.file_sizes.insert(target.clone(), 0);
4099
4100        index.invalidate_file(&target);
4101
4102        assert!(index.entries.is_empty());
4103        assert!(!index.file_mtimes.contains_key(&target));
4104        assert!(!index.file_sizes.contains_key(&target));
4105    }
4106
4107    #[test]
4108    fn refresh_missing_changed_file_is_purged_after_collect() {
4109        let temp = tempfile::tempdir().unwrap();
4110        let project_root = temp.path();
4111        let file = project_root.join("src/lib.rs");
4112        fs::create_dir_all(file.parent().unwrap()).unwrap();
4113        write_rust_file(&file, "vanished_symbol");
4114
4115        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4116        let original_size = *index.file_sizes.get(&file).unwrap();
4117        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
4118        fs::remove_file(&file).unwrap();
4119
4120        let mut embed = test_vector_for_texts;
4121        let mut progress = |_done: usize, _total: usize| {};
4122        let summary = index
4123            .refresh_stale_files(
4124                project_root,
4125                std::slice::from_ref(&file),
4126                &mut embed,
4127                8,
4128                &mut progress,
4129            )
4130            .unwrap();
4131
4132        assert_eq!(summary.changed, 0);
4133        assert_eq!(summary.added, 0);
4134        assert_eq!(summary.deleted, 1);
4135        assert!(index.entries.is_empty());
4136        assert!(!index.file_mtimes.contains_key(&file));
4137        assert!(!index.file_sizes.contains_key(&file));
4138        assert!(!index.file_hashes.contains_key(&file));
4139    }
4140
4141    #[test]
4142    fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
4143        let temp = tempfile::tempdir().unwrap();
4144        let project_root = temp.path();
4145        let file = project_root.join("src/lib.rs");
4146        fs::create_dir_all(file.parent().unwrap()).unwrap();
4147        write_rust_file(&file, "kept_symbol");
4148
4149        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4150        let original_entry_count = index.entries.len();
4151        let original_mtime = *index.file_mtimes.get(&file).unwrap();
4152        let original_size = *index.file_sizes.get(&file).unwrap();
4153
4154        let stale_mtime = SystemTime::UNIX_EPOCH;
4155        set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
4156        fs::remove_file(&file).unwrap();
4157        fs::create_dir(&file).unwrap();
4158
4159        let mut embed = test_vector_for_texts;
4160        let mut progress = |_done: usize, _total: usize| {};
4161        let summary = index
4162            .refresh_stale_files(
4163                project_root,
4164                std::slice::from_ref(&file),
4165                &mut embed,
4166                8,
4167                &mut progress,
4168            )
4169            .unwrap();
4170
4171        assert_eq!(summary.changed, 0);
4172        assert_eq!(summary.added, 0);
4173        assert_eq!(summary.deleted, 0);
4174        assert_eq!(index.entries.len(), original_entry_count);
4175        assert!(index
4176            .entries
4177            .iter()
4178            .any(|entry| entry.chunk.name == "kept_symbol"));
4179        assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
4180        assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
4181        assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
4182    }
4183
4184    #[test]
4185    fn refresh_never_indexed_file_error_does_not_record_mtime() {
4186        let temp = tempfile::tempdir().unwrap();
4187        let project_root = temp.path();
4188        let missing = project_root.join("src/missing.rs");
4189        fs::create_dir_all(missing.parent().unwrap()).unwrap();
4190
4191        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4192        let mut embed = test_vector_for_texts;
4193        let mut progress = |_done: usize, _total: usize| {};
4194        let summary = index
4195            .refresh_stale_files(
4196                project_root,
4197                std::slice::from_ref(&missing),
4198                &mut embed,
4199                8,
4200                &mut progress,
4201            )
4202            .unwrap();
4203
4204        assert_eq!(summary.added, 0);
4205        assert_eq!(summary.changed, 0);
4206        assert_eq!(summary.deleted, 0);
4207        assert!(!index.file_mtimes.contains_key(&missing));
4208        assert!(!index.file_sizes.contains_key(&missing));
4209        assert!(index.entries.is_empty());
4210    }
4211
4212    #[test]
4213    fn refresh_reports_added_for_new_files() {
4214        let temp = tempfile::tempdir().unwrap();
4215        let project_root = temp.path();
4216        let existing = project_root.join("src/lib.rs");
4217        let added = project_root.join("src/new.rs");
4218        fs::create_dir_all(existing.parent().unwrap()).unwrap();
4219        write_rust_file(&existing, "existing_symbol");
4220        write_rust_file(&added, "added_symbol");
4221
4222        let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
4223        let mut embed = test_vector_for_texts;
4224        let mut progress = |_done: usize, _total: usize| {};
4225        let summary = index
4226            .refresh_stale_files(
4227                project_root,
4228                &[existing.clone(), added.clone()],
4229                &mut embed,
4230                8,
4231                &mut progress,
4232            )
4233            .unwrap();
4234
4235        assert_eq!(summary.added, 1);
4236        assert_eq!(summary.changed, 0);
4237        assert_eq!(summary.deleted, 0);
4238        assert_eq!(summary.total_processed, 2);
4239        assert!(index.file_mtimes.contains_key(&added));
4240        assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
4241    }
4242
4243    #[test]
4244    fn refresh_reports_deleted_for_removed_files() {
4245        let temp = tempfile::tempdir().unwrap();
4246        let project_root = temp.path();
4247        let deleted = project_root.join("src/deleted.rs");
4248        fs::create_dir_all(deleted.parent().unwrap()).unwrap();
4249        write_rust_file(&deleted, "deleted_symbol");
4250
4251        let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
4252        fs::remove_file(&deleted).unwrap();
4253
4254        let mut embed = test_vector_for_texts;
4255        let mut progress = |_done: usize, _total: usize| {};
4256        let summary = index
4257            .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
4258            .unwrap();
4259
4260        assert_eq!(summary.deleted, 1);
4261        assert_eq!(summary.changed, 0);
4262        assert_eq!(summary.added, 0);
4263        assert_eq!(summary.total_processed, 1);
4264        assert!(!index.file_mtimes.contains_key(&deleted));
4265        assert!(index.entries.is_empty());
4266    }
4267
4268    #[test]
4269    fn refresh_reports_changed_for_modified_files() {
4270        let temp = tempfile::tempdir().unwrap();
4271        let project_root = temp.path();
4272        let file = project_root.join("src/lib.rs");
4273        fs::create_dir_all(file.parent().unwrap()).unwrap();
4274        write_rust_file(&file, "old_symbol");
4275
4276        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4277        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
4278        write_rust_file(&file, "new_symbol");
4279
4280        let mut embed = test_vector_for_texts;
4281        let mut progress = |_done: usize, _total: usize| {};
4282        let summary = index
4283            .refresh_stale_files(
4284                project_root,
4285                std::slice::from_ref(&file),
4286                &mut embed,
4287                8,
4288                &mut progress,
4289            )
4290            .unwrap();
4291
4292        assert_eq!(summary.changed, 1);
4293        assert_eq!(summary.added, 0);
4294        assert_eq!(summary.deleted, 0);
4295        assert_eq!(summary.total_processed, 1);
4296        assert!(index
4297            .entries
4298            .iter()
4299            .any(|entry| entry.chunk.name == "new_symbol"));
4300        assert!(!index
4301            .entries
4302            .iter()
4303            .any(|entry| entry.chunk.name == "old_symbol"));
4304    }
4305
4306    #[test]
4307    fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
4308        let temp = tempfile::tempdir().unwrap();
4309        let project_root = temp.path();
4310        let file = project_root.join("src/lib.rs");
4311        fs::create_dir_all(file.parent().unwrap()).unwrap();
4312        write_rust_file(&file, "clean_symbol");
4313
4314        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4315        let original_entries = index.entries.len();
4316        let mut embed_called = false;
4317        let mut embed = |texts: Vec<String>| {
4318            embed_called = true;
4319            test_vector_for_texts(texts)
4320        };
4321        let mut progress = |_done: usize, _total: usize| {};
4322        let summary = index
4323            .refresh_stale_files(
4324                project_root,
4325                std::slice::from_ref(&file),
4326                &mut embed,
4327                8,
4328                &mut progress,
4329            )
4330            .unwrap();
4331
4332        assert!(summary.is_noop());
4333        assert_eq!(summary.total_processed, 1);
4334        assert!(!embed_called);
4335        assert_eq!(index.entries.len(), original_entries);
4336    }
4337
4338    #[test]
4339    fn detects_missing_onnx_runtime_from_dynamic_load_error() {
4340        let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
4341
4342        assert!(is_onnx_runtime_unavailable(message));
4343    }
4344
4345    #[test]
4346    fn formats_missing_onnx_runtime_with_install_hint() {
4347        let message = format_embedding_init_error(
4348            "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
4349        );
4350
4351        assert!(message.starts_with("ONNX Runtime not found. Install via:"));
4352        assert!(message.contains("Original error:"));
4353    }
4354
4355    #[test]
4356    fn openai_compatible_backend_embeds_with_mock_server() {
4357        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
4358            assert!(request_line.starts_with("POST "));
4359            assert_eq!(path, "/v1/embeddings");
4360            "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
4361        });
4362
4363        let config = SemanticBackendConfig {
4364            backend: SemanticBackend::OpenAiCompatible,
4365            model: "test-embedding".to_string(),
4366            base_url: Some(base_url),
4367            api_key_env: None,
4368            timeout_ms: 5_000,
4369            max_batch_size: 64,
4370            max_files: 20_000,
4371        };
4372
4373        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
4374        let vectors = model
4375            .embed(vec!["hello".to_string(), "world".to_string()])
4376            .unwrap();
4377
4378        assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
4379        handle.join().unwrap();
4380    }
4381
4382    /// Regression for issue #36: AFT was sending TWO Content-Type headers
4383    /// on the OpenAI embeddings request — once implicitly via `.json(&body)`
4384    /// and again explicitly via `.header("Content-Type", "application/json")`.
4385    /// reqwest's `.header()` calls `HeaderMap::append`, which produces two
4386    /// headers on the wire. OpenAI's /v1/embeddings endpoint rejects that
4387    /// with `HTTP 400 "you must provide a model parameter"` even though the
4388    /// body actually contains `model`. The fix is to drop the explicit
4389    /// `.header("Content-Type", ...)` call. This test pins that we send
4390    /// exactly one Content-Type header.
4391    #[test]
4392    fn openai_compatible_request_has_single_content_type_header() {
4393        use std::sync::{Arc, Mutex};
4394        let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
4395        let captured_for_thread = Arc::clone(&captured);
4396
4397        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
4398        let addr = listener.local_addr().expect("local addr");
4399        let handle = thread::spawn(move || {
4400            let (mut stream, _) = listener.accept().expect("accept");
4401            let mut buf = Vec::new();
4402            let mut chunk = [0u8; 4096];
4403            let mut header_end = None;
4404            let mut content_length = 0usize;
4405            loop {
4406                let n = stream.read(&mut chunk).expect("read");
4407                if n == 0 {
4408                    break;
4409                }
4410                buf.extend_from_slice(&chunk[..n]);
4411                if header_end.is_none() {
4412                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
4413                        header_end = Some(pos + 4);
4414                        for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
4415                            if let Some(value) = line.strip_prefix("Content-Length:") {
4416                                content_length = value.trim().parse::<usize>().unwrap_or(0);
4417                            }
4418                        }
4419                    }
4420                }
4421                if let Some(end) = header_end {
4422                    if buf.len() >= end + content_length {
4423                        break;
4424                    }
4425                }
4426            }
4427            *captured_for_thread.lock().unwrap() = buf;
4428            let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
4429            let response = format!(
4430                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
4431                body.len(),
4432                body
4433            );
4434            let _ = stream.write_all(response.as_bytes());
4435        });
4436
4437        let config = SemanticBackendConfig {
4438            backend: SemanticBackend::OpenAiCompatible,
4439            model: "text-embedding-3-small".to_string(),
4440            base_url: Some(format!("http://{}", addr)),
4441            api_key_env: None,
4442            timeout_ms: 5_000,
4443            max_batch_size: 64,
4444            max_files: 20_000,
4445        };
4446        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
4447        let _ = model.embed(vec!["probe".to_string()]).unwrap();
4448        handle.join().unwrap();
4449
4450        let bytes = captured.lock().unwrap().clone();
4451        let request = String::from_utf8_lossy(&bytes);
4452
4453        // Lowercase line counts because HTTP headers are case-insensitive
4454        // and reqwest may emit `content-type` in lowercase under HTTP/2.
4455        let content_type_lines = request
4456            .lines()
4457            .filter(|line| {
4458                let lower = line.to_ascii_lowercase();
4459                lower.starts_with("content-type:")
4460            })
4461            .count();
4462        assert_eq!(
4463            content_type_lines, 1,
4464            "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
4465        );
4466
4467        // The body must still include the model field — pin this so a future
4468        // change can't accidentally drop `model` while fixing duplicate headers.
4469        assert!(
4470            request.contains(r#""model":"text-embedding-3-small""#),
4471            "request body should contain model field; full request:\n{request}",
4472        );
4473    }
4474
4475    #[test]
4476    fn ollama_backend_embeds_with_mock_server() {
4477        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
4478            assert!(request_line.starts_with("POST "));
4479            assert_eq!(path, "/api/embed");
4480            "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
4481        });
4482
4483        let config = SemanticBackendConfig {
4484            backend: SemanticBackend::Ollama,
4485            model: "embeddinggemma".to_string(),
4486            base_url: Some(base_url),
4487            api_key_env: None,
4488            timeout_ms: 5_000,
4489            max_batch_size: 64,
4490            max_files: 20_000,
4491        };
4492
4493        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
4494        let vectors = model
4495            .embed(vec!["hello".to_string(), "world".to_string()])
4496            .unwrap();
4497
4498        assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
4499        handle.join().unwrap();
4500    }
4501
4502    #[test]
4503    fn read_from_disk_rejects_fingerprint_mismatch() {
4504        let storage = tempfile::tempdir().unwrap();
4505        let project_key = "proj";
4506
4507        let project_root = test_project_root();
4508        let file = project_root.join("src/main.rs");
4509        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
4510        index.entries.push(EmbeddingEntry {
4511            chunk: SemanticChunk {
4512                file: file.clone(),
4513                name: "handle_request".to_string(),
4514                kind: SymbolKind::Function,
4515                start_line: 10,
4516                end_line: 25,
4517                exported: true,
4518                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
4519                snippet: "fn handle_request() {}".to_string(),
4520            },
4521            vector: vec![0.1, 0.2, 0.3],
4522        });
4523        index.dimension = 3;
4524        index
4525            .file_mtimes
4526            .insert(file.clone(), SystemTime::UNIX_EPOCH);
4527        index.file_sizes.insert(file, 0);
4528        index.set_fingerprint(SemanticIndexFingerprint {
4529            backend: "openai_compatible".to_string(),
4530            model: "test-embedding".to_string(),
4531            base_url: "http://127.0.0.1:1234/v1".to_string(),
4532            dimension: 3,
4533            chunking_version: default_chunking_version(),
4534        });
4535        index.write_to_disk(storage.path(), project_key);
4536
4537        let matching = index.fingerprint().unwrap().as_string();
4538        assert!(SemanticIndex::read_from_disk(
4539            storage.path(),
4540            project_key,
4541            &project_root,
4542            false,
4543            Some(&matching),
4544        )
4545        .is_some());
4546
4547        let mismatched = SemanticIndexFingerprint {
4548            backend: "ollama".to_string(),
4549            model: "embeddinggemma".to_string(),
4550            base_url: "http://127.0.0.1:11434".to_string(),
4551            dimension: 3,
4552            chunking_version: default_chunking_version(),
4553        }
4554        .as_string();
4555        assert!(SemanticIndex::read_from_disk(
4556            storage.path(),
4557            project_key,
4558            &project_root,
4559            false,
4560            Some(&mismatched),
4561        )
4562        .is_none());
4563    }
4564
4565    #[test]
4566    fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
4567        let storage = tempfile::tempdir().unwrap();
4568        let project_key = "proj-v3";
4569        let dir = storage.path().join("semantic").join(project_key);
4570        fs::create_dir_all(&dir).unwrap();
4571
4572        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4573        index.entries.push(EmbeddingEntry {
4574            chunk: SemanticChunk {
4575                file: PathBuf::from("/src/main.rs"),
4576                name: "handle_request".to_string(),
4577                kind: SymbolKind::Function,
4578                start_line: 0,
4579                end_line: 0,
4580                exported: true,
4581                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
4582                snippet: "fn handle_request() {}".to_string(),
4583            },
4584            vector: vec![0.1, 0.2, 0.3],
4585        });
4586        index.dimension = 3;
4587        index
4588            .file_mtimes
4589            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
4590        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
4591        let fingerprint = SemanticIndexFingerprint {
4592            backend: "fastembed".to_string(),
4593            model: "test".to_string(),
4594            base_url: FALLBACK_BACKEND.to_string(),
4595            dimension: 3,
4596            chunking_version: default_chunking_version(),
4597        };
4598        index.set_fingerprint(fingerprint.clone());
4599
4600        let mut bytes = index.to_bytes();
4601        bytes[0] = SEMANTIC_INDEX_VERSION_V3;
4602        fs::write(dir.join("semantic.bin"), bytes).unwrap();
4603
4604        assert!(SemanticIndex::read_from_disk(
4605            storage.path(),
4606            project_key,
4607            &test_project_root(),
4608            false,
4609            Some(&fingerprint.as_string())
4610        )
4611        .is_none());
4612        assert!(!dir.join("semantic.bin").exists());
4613    }
4614
4615    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
4616        crate::symbols::Symbol {
4617            name: name.to_string(),
4618            kind,
4619            range: crate::symbols::Range {
4620                start_line: start,
4621                start_col: 0,
4622                end_line: end,
4623                end_col: 0,
4624            },
4625            signature: None,
4626            scope_chain: Vec::new(),
4627            exported: false,
4628            parent: None,
4629        }
4630    }
4631
4632    /// Heading symbols (Markdown / HTML headings) must NOT be indexed —
4633    /// they overwhelmingly dominated semantic results even on code-shaped
4634    /// queries because heading prose embeds far more strongly than code
4635    /// chunks. Skipping headings keeps aft_search a code-finder.
4636    #[test]
4637    fn symbols_to_chunks_skips_heading_symbols() {
4638        let project_root = PathBuf::from("/proj");
4639        let file = project_root.join("README.md");
4640        let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
4641
4642        let symbols = vec![
4643            make_symbol(SymbolKind::Heading, "Title", 0, 2),
4644            make_symbol(SymbolKind::Heading, "Section", 4, 6),
4645        ];
4646
4647        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
4648        assert!(
4649            chunks.is_empty(),
4650            "Heading symbols must be filtered out before embedding; got {} chunk(s)",
4651            chunks.len()
4652        );
4653    }
4654
4655    /// A symbol with an enormous signature (e.g. a YAML/Kubernetes CronJob
4656    /// whose inline `command:` script is parsed into the signature) must not
4657    /// produce an embed_text that overflows the embedding backend's physical
4658    /// batch. Before the clamp, the unbounded `signature:` append created a
4659    /// multi-KB input that aborted the whole index build and degraded every
4660    /// search to lexical-only.
4661    #[test]
4662    fn build_embed_text_clamps_oversized_signature() {
4663        let project_root = PathBuf::from("/proj");
4664        let file = project_root.join("cronjob.yaml");
4665        let huge_sig = "kubectl ".repeat(2000); // ~16 KB
4666        let source = "apiVersion: batch/v1\nkind: CronJob\n";
4667
4668        let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
4669        symbol.signature = Some(huge_sig);
4670
4671        let text = build_embed_text(&symbol, source, &file, &project_root);
4672        assert!(
4673            text.chars().count() <= MAX_EMBED_TEXT_CHARS,
4674            "embed_text must be clamped to {} chars, got {}",
4675            MAX_EMBED_TEXT_CHARS,
4676            text.chars().count()
4677        );
4678    }
4679
4680    /// Code symbols (functions, classes, methods, structs, etc.) must still
4681    /// be indexed alongside the heading skip — otherwise we'd starve the
4682    /// index entirely.
4683    #[test]
4684    fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
4685        let project_root = PathBuf::from("/proj");
4686        let file = project_root.join("src/lib.rs");
4687        let source = "pub fn handle_request() -> bool {\n    true\n}\n";
4688
4689        let symbols = vec![
4690            // A heading mixed in (e.g. from a doc comment block elsewhere).
4691            make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
4692            make_symbol(SymbolKind::Function, "handle_request", 0, 2),
4693            make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
4694        ];
4695
4696        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
4697        assert_eq!(
4698            chunks.len(),
4699            3,
4700            "Expected file-summary + 2 code chunks (Function + Struct), got {}",
4701            chunks.len()
4702        );
4703        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
4704        assert!(chunks
4705            .iter()
4706            .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
4707        assert!(names.contains(&"handle_request"));
4708        assert!(names.contains(&"AuthService"));
4709        assert!(
4710            !names.contains(&"doc heading"),
4711            "Heading symbol leaked into chunks: {names:?}"
4712        );
4713    }
4714
4715    #[test]
4716    fn validate_ssrf_allows_loopback_hostnames() {
4717        // Loopback hostnames are explicitly allowed so self-hosted backends
4718        // (Ollama at http://localhost:11434) work at their default config.
4719        for host in &[
4720            "http://localhost",
4721            "http://localhost:8080",
4722            "http://localhost:11434", // Ollama default
4723            "http://localhost.localdomain",
4724            "http://foo.localhost",
4725        ] {
4726            assert!(
4727                validate_base_url_no_ssrf(host).is_ok(),
4728                "Expected {host} to be allowed (loopback), got: {:?}",
4729                validate_base_url_no_ssrf(host)
4730            );
4731        }
4732    }
4733
4734    #[test]
4735    fn validate_ssrf_allows_loopback_ips() {
4736        // 127.0.0.0/8 is loopback — by definition same-machine and not an
4737        // SSRF target. Allow it so Ollama at http://127.0.0.1:11434 works.
4738        for url in &[
4739            "http://127.0.0.1",
4740            "http://127.0.0.1:11434", // Ollama default
4741            "http://127.0.0.1:8080",
4742            "http://127.1.2.3",
4743        ] {
4744            let result = validate_base_url_no_ssrf(url);
4745            assert!(
4746                result.is_ok(),
4747                "Expected {url} to be allowed (loopback), got: {:?}",
4748                result
4749            );
4750        }
4751    }
4752
4753    #[test]
4754    fn validate_ssrf_rejects_private_non_loopback_ips() {
4755        // Non-loopback private/reserved IPs remain rejected — homelab/intranet
4756        // services on LAN IPs are real SSRF targets even though the user
4757        // configured them. Users who want this can opt in by binding the
4758        // service to a public-routable address.
4759        for url in &[
4760            "http://192.168.1.1",
4761            "http://10.0.0.1",
4762            "http://172.16.0.1",
4763            "http://169.254.169.254",
4764            "http://100.64.0.1",
4765        ] {
4766            let result = validate_base_url_no_ssrf(url);
4767            assert!(
4768                result.is_err(),
4769                "Expected {url} to be rejected (non-loopback private), got: {:?}",
4770                result
4771            );
4772        }
4773    }
4774
4775    #[test]
4776    fn validate_ssrf_rejects_mdns_local_hostnames() {
4777        // mDNS .local hostnames typically resolve to LAN devices, not
4778        // loopback. Rejecting them before DNS lookup gives a clearer error.
4779        for host in &[
4780            "http://printer.local",
4781            "http://nas.local:8080",
4782            "http://homelab.local",
4783        ] {
4784            let result = validate_base_url_no_ssrf(host);
4785            assert!(
4786                result.is_err(),
4787                "Expected {host} to be rejected (mDNS), got: {:?}",
4788                result
4789            );
4790        }
4791    }
4792
4793    #[test]
4794    fn normalize_base_url_allows_localhost_for_tests() {
4795        // normalize_base_url itself should NOT block localhost — only
4796        // validate_base_url_no_ssrf does. Tests construct backends directly.
4797        assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
4798        assert!(normalize_base_url("http://localhost:8080").is_ok());
4799    }
4800
4801    /// Pin the user-facing wording of the ONNX version-mismatch error.
4802    /// The auto-fix path MUST be listed first because it's the only safe
4803    /// option that doesn't require sudo or risk breaking other apps that
4804    /// link the system library. Regression of any of these strings would
4805    /// either mislead users (system rm before auto-fix) or break the
4806    /// `aft doctor --fix` discovery path.
4807    #[test]
4808    fn ort_mismatch_message_recommends_auto_fix_first() {
4809        let msg =
4810            format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
4811
4812        // The reported version and path must appear verbatim.
4813        assert!(
4814            msg.contains("v1.9.0"),
4815            "should report detected version: {msg}"
4816        );
4817        assert!(
4818            msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
4819            "should report system path: {msg}"
4820        );
4821        assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
4822
4823        // Solution ordering: auto-fix is #1, system rm is #2, install is #3.
4824        let auto_fix_pos = msg
4825            .find("Auto-fix")
4826            .expect("Auto-fix solution missing — users won't discover --fix");
4827        let remove_pos = msg
4828            .find("Remove the old library")
4829            .expect("system-rm solution missing");
4830        assert!(
4831            auto_fix_pos < remove_pos,
4832            "Auto-fix must come before manual rm — see PR comment thread"
4833        );
4834
4835        // The auto-fix command must be runnable as-is on a fresh system.
4836        assert!(
4837            msg.contains("npx @cortexkit/aft doctor --fix"),
4838            "auto-fix command must be present and copy-pasteable: {msg}"
4839        );
4840    }
4841
4842    #[cfg(any(target_os = "linux", target_os = "macos"))]
4843    #[test]
4844    fn loaded_ort_version_detection_prefers_actual_loaded_library_path() {
4845        let requested = "libonnxruntime.so";
4846        let actual = "/usr/local/lib/libonnxruntime.so.1.19.0";
4847
4848        assert_eq!(detect_ort_version_from_path(requested), None);
4849        let (version, source) =
4850            detect_ort_version_from_resolved_or_requested(Some(actual.to_string()), requested);
4851
4852        assert_eq!(version, Some("1.19.0".to_string()));
4853        assert_eq!(source, actual);
4854
4855        let msg = format_ort_version_mismatch(&version.unwrap(), &source);
4856        assert!(msg.contains("v1.19.0"));
4857        assert!(msg.contains(actual));
4858    }
4859
4860    /// macOS dylib paths must not produce a malformed message when the
4861    /// system path lacks a trailing slash. This is a regression guard
4862    /// for the "{}\n{}" format string contract.
4863    #[test]
4864    fn ort_mismatch_message_handles_macos_dylib_path() {
4865        let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
4866        assert!(msg.contains("v1.9.0"));
4867        assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
4868        // The dylib path must appear in the auto-fix paragraph (single
4869        // quotes around it) AND in the manual-rm paragraph; verify
4870        // both placements survived the format string.
4871        assert!(
4872            msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
4873            "system path should be quoted in the auto-fix sentence: {msg}"
4874        );
4875    }
4876}