Skip to main content

aft/
semantic_index.rs

1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::io::{self, BufReader, BufWriter, Cursor, Read, Write};
18use std::path::{Path, PathBuf};
19use std::sync::Mutex;
20use std::time::Duration;
21use std::time::SystemTime;
22use tree_sitter::Parser;
23use url::Url;
24
const DEFAULT_DIMENSION: usize = 384;
const MAX_ENTRIES: usize = 1_000_000;
// Covers high-dimensional backends such as OpenAI text-embedding-3-large (3072)
// and common local models (4096) while keeping a bounded supported shape.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of a single `f32`.
const F32_BYTES: usize = std::mem::size_of::<f32>();
// NOTE(review): presumably the on-disk header sizes of the V1 and V2 index
// formats — confirm against the index reader/writer.
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// User-facing hint describing how to install or diagnose ONNX Runtime.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
/// V3 adds subsec_nanos to the file-mtime table so staleness detection survives
/// restart round-trips on filesystems with subsecond mtime precision (APFS,
/// ext4 with nsec, NTFS). V1/V2 persisted whole-second mtimes only, which
/// caused every restart to flag ~99% of files as stale and re-embed them.
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
/// V4 keeps the V3 on-disk layout but rebuilds persisted snippets once after
/// fixing symbol ranges that were incorrectly treated as 1-based.
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
/// V5 adds file sizes to the file metadata table so incremental staleness
/// detection can catch content changes even when mtime precision misses them.
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
/// V6 stores paths relative to project_root and adds content hashes.
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended by `build_openai_embeddings_endpoint` after the `/v1` segment.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended by `build_ollama_embeddings_endpoint` when the base URL does
/// not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
// Must stay below the bridge timeout (30s) to avoid bridge kills on slow backends.
// Applied by `SemanticEmbeddingModel::from_config` when `timeout_ms` is 0.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used by `SemanticEmbeddingModel::from_config` when the
/// configured `max_batch_size` is 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Maximum number of query embeddings kept by `embed_query_cached` before the
/// oldest inserted entry is evicted.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored as the fingerprint `base_url` when the config has no
/// base URL or it fails to normalize.
const FALLBACK_BACKEND: &str = "none";
/// Total number of attempts `send_embedding_request` makes per request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay before each retry, indexed by the zero-based index of the attempt
/// that just failed; the final attempt is never followed by a sleep.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Held by `SemanticIndexLock::acquire` while it waits on the file lock, so
/// acquisition attempts within this process run one at a time.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
63
64pub struct SemanticIndexLock {
65    _guard: fs_lock::LockGuard,
66}
67
68impl SemanticIndexLock {
69    pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
70        let dir = storage_dir.join("semantic").join(project_key);
71        fs::create_dir_all(&dir)?;
72        let path = dir.join("cache.lock");
73        let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
74            .lock()
75            .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
76        fs_lock::try_acquire(&path, Duration::from_secs(2))
77            .map(|guard| Self { _guard: guard })
78            .map_err(|error| match error {
79                fs_lock::AcquireError::Timeout => {
80                    std::io::Error::other("timed out acquiring semantic cache lock")
81                }
82                fs_lock::AcquireError::Io(error) => error,
83            })
84    }
85}
86
/// Identity of the embedding setup an index was built with.
///
/// Serialized to JSON by `as_string` and compared as a string by
/// `matches_expected`, so the field order here is part of the encoded form.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier, taken from `config.backend.as_str()`.
    pub backend: String,
    /// Embedding model name as configured.
    pub model: String,
    /// Normalized base URL, or the `FALLBACK_BACKEND` placeholder when the
    /// config has none; deserializes as an empty string when absent.
    #[serde(default)]
    pub base_url: String,
    /// Length of the embedding vectors produced by the backend.
    pub dimension: usize,
    /// Chunking scheme version; falls back to `default_chunking_version()`
    /// when the serialized form lacks the field.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
97
/// Chunking version written into new fingerprints and assumed by serde for
/// serialized fingerprints that lack the field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
101
102impl SemanticIndexFingerprint {
103    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
104        // Use normalized URL for fingerprinting so cosmetic differences
105        // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds.
106        let base_url = config
107            .base_url
108            .as_ref()
109            .and_then(|u| normalize_base_url(u).ok())
110            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
111        Self {
112            backend: config.backend.as_str().to_string(),
113            model: config.model.clone(),
114            base_url,
115            dimension,
116            chunking_version: default_chunking_version(),
117        }
118    }
119
120    pub fn as_string(&self) -> String {
121        serde_json::to_string(self).unwrap_or_else(|_| String::new())
122    }
123
124    fn matches_expected(&self, expected: &str) -> bool {
125        let encoded = self.as_string();
126        !encoded.is_empty() && encoded == expected
127    }
128}
129
/// Concrete embedding engine behind a `SemanticEmbeddingModel`.
enum SemanticEmbeddingEngine {
    /// Local ONNX embedder (all-MiniLM-L6-v2 via raw `ort`). The config-facing
    /// backend string stays "fastembed" for index-fingerprint compatibility.
    Local(LocalEmbedder),
    /// HTTP backend reached through the endpoint built by
    /// `build_openai_embeddings_endpoint`.
    OpenAiCompatible {
        /// Blocking HTTP client configured in `from_config`.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Normalized base URL (see `normalize_base_url`).
        base_url: String,
        /// Optional key sent as an `Authorization: Bearer` header.
        api_key: Option<String>,
    },
    /// HTTP backend reached through the endpoint built by
    /// `build_ollama_embeddings_endpoint`.
    Ollama {
        /// Blocking HTTP client configured in `from_config`.
        client: Client,
        /// Model name sent in the request body.
        model: String,
        /// Normalized base URL (see `normalize_base_url`).
        base_url: String,
    },
}
146
/// An embedding backend plus its effective settings and a small query cache.
pub struct SemanticEmbeddingModel {
    /// Backend kind copied from the config.
    backend: SemanticBackend,
    /// Model name copied from the config.
    model: String,
    /// Base URL exactly as configured (not normalized).
    base_url: Option<String>,
    /// Effective request timeout in milliseconds, after defaulting.
    timeout_ms: u64,
    /// Effective maximum batch size, after defaulting.
    max_batch_size: usize,
    /// Cached vector length; `None` until a probe or remote embed has run.
    dimension: Option<usize>,
    /// The engine that actually produces embeddings.
    engine: SemanticEmbeddingEngine,
    /// Query text -> embedding, filled by `embed_query_cached`.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    /// Insertion order of cached queries; the front entry is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    /// Number of `embed_query_cached` calls served from the cache.
    query_embedding_cache_hits: u64,
    /// Number of `embed_query_cached` calls that had to embed the query.
    query_embedding_cache_misses: u64,
}

/// Shorter alias for [`SemanticEmbeddingModel`].
pub type EmbeddingModel = SemanticEmbeddingModel;
162
163fn validate_embedding_batch(
164    vectors: &[Vec<f32>],
165    expected_count: usize,
166    context: &str,
167) -> Result<(), String> {
168    if expected_count > 0 && vectors.is_empty() {
169        return Err(format!(
170            "{context} returned no vectors for {expected_count} inputs"
171        ));
172    }
173
174    if vectors.len() != expected_count {
175        return Err(format!(
176            "{context} returned {} vectors for {} inputs",
177            vectors.len(),
178            expected_count
179        ));
180    }
181
182    let Some(first_vector) = vectors.first() else {
183        return Ok(());
184    };
185    let expected_dimension = first_vector.len();
186    validate_embedding_dimension(expected_dimension)
187        .map_err(|error| format!("{context} returned {error}"))?;
188    for (index, vector) in vectors.iter().enumerate() {
189        if vector.len() != expected_dimension {
190            return Err(format!(
191                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
192                vector.len()
193            ));
194        }
195    }
196
197    Ok(())
198}
199
200fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
201    if dimension == 0 || dimension > MAX_DIMENSION {
202        return Err(format!(
203            "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
204        ));
205    }
206
207    Ok(())
208}
209
210/// Normalize a base URL: validate scheme and strip trailing slash.
211/// Does NOT perform SSRF/private-IP validation — call
212/// `validate_base_url_no_ssrf` separately when processing user-supplied config.
213fn normalize_base_url(raw: &str) -> Result<String, String> {
214    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
215    let scheme = parsed.scheme();
216    if scheme != "http" && scheme != "https" {
217        return Err(format!(
218            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
219            scheme
220        ));
221    }
222    Ok(parsed.to_string().trim_end_matches('/').to_string())
223}
224
225/// Validate that a base URL does not point to a private/loopback address.
226/// Call this on user-supplied config (at configure time) to prevent SSRF.
227/// Not called for programmatically constructed configs (e.g. tests).
228///
229/// **Loopback is allowed.** Self-hosted embedding backends (e.g. Ollama at
230/// `http://127.0.0.1:11434`) are a primary use case for `aft_search`. Loopback
231/// addresses by definition cannot be exploited as SSRF targets — they only
232/// reach services on the same machine. Allowing loopback unblocks Ollama at its
233/// default config without opening up SSRF to LAN/intranet services, which
234/// remain rejected.
235///
236/// **mDNS `.local` is rejected.** mDNS hostnames typically resolve to LAN
237/// devices (printers, homelab servers); rejecting them before DNS lookup keeps
238/// the SSRF guard meaningful for non-loopback private networks.
239pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
240    use std::net::{IpAddr, ToSocketAddrs};
241
242    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
243
244    let host = parsed.host_str().unwrap_or("");
245
246    // Loopback hostnames are explicitly allowed. RFC 6761 mandates that
247    // `localhost` and `*.localhost` resolve to loopback;
248    // `localhost.localdomain` is a historical alias used on some Linux
249    // distros. Self-hosted backends like Ollama use these by default.
250    let is_loopback_host =
251        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
252    if is_loopback_host {
253        return Ok(());
254    }
255
256    // mDNS hostnames are typically LAN devices, not loopback. Reject before
257    // DNS lookup so users get a clear error rather than a private-IP error.
258    if host.ends_with(".local") {
259        return Err(format!(
260            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
261        ));
262    }
263
264    // Resolve the hostname. Reject private/link-local/CGNAT IPs but NOT
265    // loopback (which is by definition same-machine and not an SSRF target).
266    let port = parsed.port_or_known_default().unwrap_or(443);
267    let addr_str = format!("{host}:{port}");
268    let addrs: Vec<IpAddr> = addr_str
269        .to_socket_addrs()
270        .map(|iter| iter.map(|sa| sa.ip()).collect())
271        .unwrap_or_default();
272    for ip in &addrs {
273        if is_private_non_loopback_ip(ip) {
274            return Err(format!(
275                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
276            ));
277        }
278    }
279
280    Ok(())
281}
282
283/// Returns true for IPv4/IPv6 addresses in private/link-local/CGNAT/benchmark/
284/// multicast/reserved ranges, EXCLUDING loopback (127.0.0.0/8 and ::1). Loopback
285/// is considered safe for SSRF purposes (same-machine, e.g. a local Ollama
286/// endpoint) — see [`validate_base_url_no_ssrf`] for rationale.
287///
288/// Delegates to [`crate::url_fetch::is_private_or_reserved_ip`] so there is one
289/// authoritative reserved-range list (the url_fetch copy is the maintained one;
290/// this used to be a drifting subset that missed e.g. 198.18.0.0/15 and the
291/// multicast/reserved blocks). We only re-add the loopback carve-out the
292/// url_fetch guard deliberately does not make.
293fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
294    // Canonicalize so an IPv4-mapped loopback (`::ffff:127.0.0.1`) is also
295    // recognized as loopback, matching the prior carve-out.
296    if ip.to_canonical().is_loopback() {
297        return false;
298    }
299    crate::url_fetch::is_private_or_reserved_ip(*ip)
300}
301
302fn build_openai_embeddings_endpoint(base_url: &str) -> String {
303    if base_url.ends_with("/v1") {
304        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
305    } else {
306        format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
307    }
308}
309
310fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
311    if base_url.ends_with("/api") {
312        format!("{base_url}/embed")
313    } else {
314        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
315    }
316}
317
/// Trim surrounding whitespace from an optional string and treat a blank
/// result the same as an absent value.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let trimmed = value.as_deref().map(str::trim)?;
    (!trimmed.is_empty()).then(|| trimmed.to_owned())
}
328
329fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
330    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
331}
332
333/// Local backends (LM Studio, Ollama, llama.cpp) can return a 4xx — usually
334/// 400/409 — while a model is loading or was just unloaded. Only narrowly known
335/// local-backend loading/unloaded payloads are classified transient; generic
336/// 4xx bodies that merely mention phrases like "loading model" remain
337/// permanent so misconfigurations do not retry forever.
338fn embedding_response_body_is_transient(status: reqwest::StatusCode, raw: &str) -> bool {
339    if !matches!(
340        status,
341        reqwest::StatusCode::BAD_REQUEST
342            | reqwest::StatusCode::CONFLICT
343            | reqwest::StatusCode::REQUEST_TIMEOUT
344            | reqwest::StatusCode::LOCKED
345            | reqwest::StatusCode::TOO_EARLY
346    ) {
347        return false;
348    }
349
350    let lower = raw.to_ascii_lowercase();
351    let normalized = lower.trim();
352
353    normalized.contains("model was unloaded while the request was still in queue")
354        || normalized == "model is loading"
355        || normalized.starts_with("model is loading,")
356        || normalized.contains(r#""error":"model is loading"#)
357        || normalized.contains(r#""message":"model is loading"#)
358        || normalized == "model not loaded"
359        || normalized.contains(r#""error":"model not loaded""#)
360        || normalized.contains(r#""message":"model not loaded""#)
361        || normalized == "loading model into memory"
362        || normalized.contains(r#""error":"loading model into memory""#)
363        || normalized.contains(r#""message":"loading model into memory""#)
364        || normalized == "model is being loaded"
365        || normalized.contains(r#""error":"model is being loaded""#)
366        || normalized.contains(r#""message":"model is being loaded""#)
367        || normalized == "model is currently loading"
368        || normalized.contains(r#""error":"model is currently loading""#)
369        || normalized.contains(r#""message":"model is currently loading""#)
370}
371
372fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
373    error.is_connect()
374}
375
376/// Whether a send-time error means the backend is *unreachable or temporarily
377/// failing* (vs. a real misconfiguration). Broader than the in-request retry
378/// predicate: a per-request timeout is transient for the build/refresh layer
379/// (the model may still be cold-loading) but we don't burn the 3 fast
380/// in-request attempts on it — the build-level retry rides it out instead.
381fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
382    error.is_connect() || error.is_timeout()
383}
384
385fn embedding_response_read_error_is_transient(error: &reqwest::Error) -> bool {
386    embedding_send_error_is_transient(error) || error.is_body() || error.is_decode()
387}
388
/// Machine-readable marker prepended to embedding error strings whose root
/// cause is transient — the backend is down, timing out, or returning
/// 5xx/429, not misconfigured. The build and corpus-refresh layers decide
/// retry-vs-give-up from this marker (see [`embedding_failure_is_transient`])
/// instead of re-parsing error text, so the classification is made once, at
/// the site that knows it. Remove it before showing a message to the user via
/// [`strip_transient_embedding_marker`].
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";

/// True when `error` contains the transient marker anywhere, meaning a retry
/// once the backend recovers is appropriate rather than a hard failure.
pub fn embedding_failure_is_transient(error: &str) -> bool {
    let marker = TRANSIENT_EMBEDDING_MARKER;
    error.contains(marker)
}

/// Return `error` with every occurrence of the transient marker removed, so
/// the message is clean for display.
pub fn strip_transient_embedding_marker(error: &str) -> String {
    error.split(TRANSIENT_EMBEDDING_MARKER).collect()
}
407
408fn sleep_before_embedding_retry(attempt_index: usize) {
409    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
410        std::thread::sleep(Duration::from_millis(*delay_ms));
411    }
412}
413
/// Send one embedding HTTP request with bounded in-request retries and return
/// the raw response body on a success status.
///
/// `make_request` is called once per attempt to build a fresh request (at most
/// `EMBEDDING_REQUEST_MAX_ATTEMPTS` attempts). `backend_label` is only used to
/// prefix error messages.
///
/// Retried, with `sleep_before_embedding_retry` between attempts: connection
/// errors on send, transient body-read errors, 5xx/429 statuses, and 4xx
/// responses whose body matches `embedding_response_body_is_transient`.
///
/// # Errors
/// Returns a formatted message. It is prefixed with
/// [`TRANSIENT_EMBEDDING_MARKER`] when the final failure is classified as
/// transient (connect/timeout on send, transient read error, 5xx/429, or a
/// model-loading body); other failures carry no marker.
fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
where
    F: FnMut() -> reqwest::blocking::RequestBuilder,
{
    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;

        let response = match make_request().send() {
            Ok(response) => response,
            Err(error) => {
                if !last_attempt && is_retryable_embedding_error(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                // Connect/timeout failures mean the backend is unreachable or
                // cold-loading — mark transient so the build layer rides it out
                // and self-heals instead of parking the index in `Failed`.
                let marker = if embedding_send_error_is_transient(&error) {
                    TRANSIENT_EMBEDDING_MARKER
                } else {
                    ""
                };
                return Err(format!("{marker}{backend_label} request failed: {error}"));
            }
        };

        // Capture the status before `text()` consumes the response.
        let status = response.status();
        let raw = match response.text() {
            Ok(raw) => raw,
            Err(error) => {
                if !last_attempt && embedding_response_read_error_is_transient(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                let marker = if embedding_response_read_error_is_transient(&error) {
                    TRANSIENT_EMBEDDING_MARKER
                } else {
                    ""
                };
                return Err(format!(
                    "{marker}{backend_label} response read failed: {error}"
                ));
            }
        };

        if status.is_success() {
            return Ok(raw);
        }

        // A 4xx whose body says the model is loading/unloaded is transient on
        // local backends (LM Studio/Ollama), so treat it like a retryable
        // status: ride it out at both the in-request and build-retry layers.
        let body_transient = embedding_response_body_is_transient(status, &raw);
        if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
            sleep_before_embedding_retry(attempt_index);
            continue;
        }

        // 5xx / 429 are server-side and transient — the backend is overloaded
        // or briefly unavailable, not misconfigured. A 4xx whose body indicates
        // the model is (un)loading is also transient (local backend mid-swap).
        // Other 4xx (auth, bad request, model-not-found) is a real error the
        // user must fix; no marker.
        let marker = if is_retryable_embedding_status(status) || body_transient {
            TRANSIENT_EMBEDDING_MARKER
        } else {
            ""
        };
        return Err(format!(
            "{marker}{backend_label} request failed (HTTP {}): {}",
            status, raw
        ));
    }

    // Every path through the final attempt returns, so the loop cannot fall
    // through while EMBEDDING_REQUEST_MAX_ATTEMPTS is non-zero.
    unreachable!("embedding request retries exhausted without returning")
}
490
491impl SemanticEmbeddingModel {
492    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
493        let timeout_ms = if config.timeout_ms == 0 {
494            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
495        } else {
496            config.timeout_ms
497        };
498
499        let max_batch_size = if config.max_batch_size == 0 {
500            DEFAULT_MAX_BATCH_SIZE
501        } else {
502            config.max_batch_size
503        };
504
505        let api_key_env = normalize_api_key(config.api_key_env.clone());
506        let model = config.model.clone();
507
508        let client = Client::builder()
509            .timeout(Duration::from_millis(timeout_ms))
510            .redirect(reqwest::redirect::Policy::none())
511            .build()
512            .map_err(|error| format!("failed to configure embedding client: {error}"))?;
513
514        let engine = match config.backend {
515            SemanticBackend::Fastembed => {
516                SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
517            }
518            SemanticBackend::OpenAiCompatible => {
519                let raw = config.base_url.as_ref().ok_or_else(|| {
520                    "base_url is required for openai_compatible backend".to_string()
521                })?;
522                let base_url = normalize_base_url(raw)?;
523
524                let api_key = match api_key_env {
525                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
526                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
527                    })?),
528                    None => None,
529                };
530
531                SemanticEmbeddingEngine::OpenAiCompatible {
532                    client,
533                    model,
534                    base_url,
535                    api_key,
536                }
537            }
538            SemanticBackend::Ollama => {
539                let raw = config
540                    .base_url
541                    .as_ref()
542                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
543                let base_url = normalize_base_url(raw)?;
544
545                SemanticEmbeddingEngine::Ollama {
546                    client,
547                    model,
548                    base_url,
549                }
550            }
551        };
552
553        Ok(Self {
554            backend: config.backend,
555            model: config.model.clone(),
556            base_url: config.base_url.clone(),
557            timeout_ms,
558            max_batch_size,
559            dimension: None,
560            engine,
561            query_embedding_cache: HashMap::new(),
562            query_embedding_cache_order: VecDeque::new(),
563            query_embedding_cache_hits: 0,
564            query_embedding_cache_misses: 0,
565        })
566    }
567
    /// Backend kind this model was configured with.
    pub fn backend(&self) -> SemanticBackend {
        self.backend
    }
571
    /// Configured embedding model name.
    pub fn model(&self) -> &str {
        &self.model
    }
575
    /// Base URL as stored on the model, if any.
    pub fn base_url(&self) -> Option<&str> {
        self.base_url.as_deref()
    }
579
    /// Maximum batch size stored on the model.
    pub fn max_batch_size(&self) -> usize {
        self.max_batch_size
    }
583
    /// Request timeout in milliseconds stored on the model.
    pub fn timeout_ms(&self) -> u64 {
        self.timeout_ms
    }
587
588    pub fn fingerprint(
589        &mut self,
590        config: &SemanticBackendConfig,
591    ) -> Result<SemanticIndexFingerprint, String> {
592        let dimension = self.dimension()?;
593        Ok(SemanticIndexFingerprint::from_config(config, dimension))
594    }
595
596    pub fn dimension(&mut self) -> Result<usize, String> {
597        if let Some(dimension) = self.dimension {
598            return Ok(dimension);
599        }
600
601        let dimension = match &mut self.engine {
602            SemanticEmbeddingEngine::Local(model) => {
603                let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
604                vectors
605                    .first()
606                    .map(|v| v.len())
607                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
608            }
609            SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
610                let vectors =
611                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
612                vectors
613                    .first()
614                    .map(|v| v.len())
615                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
616            }
617            SemanticEmbeddingEngine::Ollama { .. } => {
618                let vectors =
619                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
620                vectors
621                    .first()
622                    .map(|v| v.len())
623                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
624            }
625        };
626
627        self.dimension = Some(dimension);
628        Ok(dimension)
629    }
630
    /// Embed a batch of texts; public entry point that forwards to
    /// `embed_texts` and returns its result unchanged.
    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        self.embed_texts(texts)
    }
634
635    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
636        if let Some(vector) = self.query_embedding_cache.get(query) {
637            self.query_embedding_cache_hits += 1;
638            return Ok(vector.clone());
639        }
640
641        self.query_embedding_cache_misses += 1;
642        let embeddings = self.embed_texts(vec![query.to_string()])?;
643        let vector = embeddings
644            .first()
645            .cloned()
646            .ok_or_else(|| "embedding model returned no query vector".to_string())?;
647
648        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
649            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
650                self.query_embedding_cache.remove(&oldest);
651            }
652        }
653        self.query_embedding_cache
654            .insert(query.to_string(), vector.clone());
655        self.query_embedding_cache_order
656            .push_back(query.to_string());
657
658        Ok(vector)
659    }
660
    /// Query-cache statistics as `(hits, misses, cached_entries)`.
    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
        (
            self.query_embedding_cache_hits,
            self.query_embedding_cache_misses,
            self.query_embedding_cache.len(),
        )
    }
668
669    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
670        match &mut self.engine {
671            SemanticEmbeddingEngine::Local(model) => model
672                .embed(&texts)
673                .map_err(|error| format!("failed to embed batch: {error}")),
674            SemanticEmbeddingEngine::OpenAiCompatible {
675                client,
676                model,
677                base_url,
678                api_key,
679            } => {
680                let expected_text_count = texts.len();
681                let endpoint = build_openai_embeddings_endpoint(base_url);
682                let body = serde_json::json!({
683                    "input": texts,
684                    "model": model,
685                });
686
687                let raw = send_embedding_request(
688                    || {
689                        // `.json(&body)` sets Content-Type: application/json
690                        // automatically. Do NOT add `.header("Content-Type",
691                        // "application/json")` afterwards — RequestBuilder::header()
692                        // calls HeaderMap::append, which produces TWO Content-Type
693                        // headers on the wire. OpenAI's /v1/embeddings endpoint
694                        // treats duplicate Content-Type as malformed and rejects
695                        // the body with 400 "you must provide a model parameter"
696                        // even when `model` is set. Verified end-to-end against
697                        // api.openai.com. See issue #36.
698                        let mut request = client.post(&endpoint).json(&body);
699
700                        if let Some(api_key) = api_key {
701                            request = request.header("Authorization", format!("Bearer {api_key}"));
702                        }
703
704                        request
705                    },
706                    "openai compatible",
707                )?;
708
709                #[derive(Deserialize)]
710                struct OpenAiResponse {
711                    data: Vec<OpenAiEmbeddingResult>,
712                }
713
714                #[derive(Deserialize)]
715                struct OpenAiEmbeddingResult {
716                    embedding: Vec<f32>,
717                    index: Option<u32>,
718                }
719
720                let parsed: OpenAiResponse = serde_json::from_str(&raw)
721                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
722                if parsed.data.len() != expected_text_count {
723                    return Err(format!(
724                        "openai compatible response returned {} embeddings for {} inputs",
725                        parsed.data.len(),
726                        expected_text_count
727                    ));
728                }
729
730                let mut vectors = vec![Vec::new(); parsed.data.len()];
731                for (i, item) in parsed.data.into_iter().enumerate() {
732                    let index = item.index.unwrap_or(i as u32) as usize;
733                    if index >= vectors.len() {
734                        return Err(
735                            "openai compatible response contains invalid vector index".to_string()
736                        );
737                    }
738                    vectors[index] = item.embedding;
739                }
740
741                for vector in &vectors {
742                    if vector.is_empty() {
743                        return Err(
744                            "openai compatible response contained missing vectors".to_string()
745                        );
746                    }
747                }
748
749                self.dimension = vectors.first().map(Vec::len);
750                Ok(vectors)
751            }
752            SemanticEmbeddingEngine::Ollama {
753                client,
754                model,
755                base_url,
756            } => {
757                let expected_text_count = texts.len();
758                let endpoint = build_ollama_embeddings_endpoint(base_url);
759
760                #[derive(Serialize)]
761                struct OllamaPayload<'a> {
762                    model: &'a str,
763                    input: Vec<String>,
764                }
765
766                let payload = OllamaPayload {
767                    model,
768                    input: texts,
769                };
770
771                let raw = send_embedding_request(
772                    || {
773                        // `.json(&payload)` sets Content-Type automatically.
774                        // Same duplicate-header trap as the OpenAI branch above
775                        // — most Ollama servers tolerate it, but the
776                        // single-Content-Type form is the correct one.
777                        client.post(&endpoint).json(&payload)
778                    },
779                    "ollama",
780                )?;
781
782                #[derive(Deserialize)]
783                struct OllamaResponse {
784                    embeddings: Vec<Vec<f32>>,
785                }
786
787                let parsed: OllamaResponse = serde_json::from_str(&raw)
788                    .map_err(|error| format!("invalid ollama response: {error}"))?;
789                if parsed.embeddings.is_empty() {
790                    return Err("ollama response returned no embeddings".to_string());
791                }
792                if parsed.embeddings.len() != expected_text_count {
793                    return Err(format!(
794                        "ollama response returned {} embeddings for {} inputs",
795                        parsed.embeddings.len(),
796                        expected_text_count
797                    ));
798                }
799
800                let vectors = parsed.embeddings;
801                for vector in &vectors {
802                    if vector.is_empty() {
803                        return Err("ollama response contained empty embeddings".to_string());
804                    }
805                }
806
807                self.dimension = vectors.first().map(Vec::len);
808                Ok(vectors)
809            }
810        }
811    }
812}
813
/// Pre-validate ONNX Runtime by loading the shared library ourselves before
/// the ort crate touches it.
///
/// The library named by the `ORT_DYLIB_PATH` environment variable (or the
/// platform default name when the variable is unset) is opened with `dlopen`
/// on Linux/macOS and `LoadLibraryExW` on Windows, probed for its version,
/// and released again. This catches broken/incompatible libraries without
/// risking a panic in the ort crate.
///
/// Version detection does not call into the runtime:
/// * Linux/macOS: the version is parsed from the file name of the library
///   that was actually loaded (located through its `OrtGetApiBase` symbol),
///   falling back to the requested name.
/// * Windows: the version is read from the DLL's fixed file-version resource.
///
/// # Errors
///
/// Returns a user-facing message when the library cannot be loaded, or when a
/// version was detected and it is not `1.x` with `x >= 20`. When no version
/// can be determined the check is skipped and `Ok(())` is returned. On targets
/// other than Linux, macOS and Windows no validation is compiled in and the
/// function always returns `Ok(())`.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    // Explicit override for the library location; `None` means "use the
    // platform default name and let the loader search for it".
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY: `c_name` is a NUL-terminated CString that stays alive for the
        // whole block; `handle` is only used after the null check and is
        // closed exactly once; the `dlerror` pointer is read immediately after
        // the failed `dlopen` and only when it is non-null.
        unsafe {
            // Fails only when `lib_name` contains an interior NUL byte.
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Try to detect the runtime version from the actual loaded library
            // path first. A bare dlopen("libonnxruntime.so") may resolve to an
            // older system ORT through loader search paths; checking only the
            // caller-supplied soname would miss that and let ort fail opaquely.
            let (detected_version, version_source) =
                detect_ort_version_from_loaded_library(handle, lib_name);

            // The handle is no longer needed once the path has been resolved;
            // release it before any early return below.
            libc::dlclose(handle);

            // Check version compatibility — we need 1.20+.
            // Only the first two dot-separated components are inspected, and
            // the check is skipped entirely when either fails to parse.
            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, &version_source));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        // Validate ONNX Runtime availability on Windows by loading the DLL
        // via LoadLibraryExW before the ort crate attempts its own LoadLibrary.
        // This way we can produce a friendly error (with installation hints)
        // instead of a raw LoadLibrary failure from deep inside fastembed.
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        // Use kernel32 LoadLibraryExW for the validation — built-in, no
        // crate dependency required. GetModuleFileNameW resolves the loaded
        // DLL path for version probing via the version.dll API.
        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Local mirror of the Win32 fixed file-version record; only
        // `dw_file_version_ms` is read below.
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32, // HIWORD major, LOWORD minor
            dw_file_version_ls: u32, // HIWORD build, LOWORD revision
            dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY: `wide` is NUL-terminated and outlives the load call;
        // `handle` is used only after the null check and freed exactly once;
        // `path_buf` and `info` are live, correctly sized buffers for the
        // calls that write into them, and `info` is still alive when `fixed`
        // is dereferenced.
        // NOTE(review): `vs_len` is never compared against
        // `size_of::<VS_FIXEDFILEINFO>()` before `fixed` is read — confirm the
        // API guarantees a full record for the root ("\\") sub-block.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            // UTF-16, NUL-terminated copy of the library name for the W APIs.
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // Probe the file version from PE resources so we can reject
            // outdated DLLs (e.g. v1.9.x) before the ort crate panics.
            // A major version of 0 doubles as the "not detected" marker.
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            // Use MAX_UNICODEPATH (32767) so deeply nested ORT paths (e.g.
            // long NuGet package paths under %USERPROFILE%) never truncate.
            // GetModuleFileNameW truncates silently when the buffer is too
            // small, which causes version probing to fail and the version
            // check to be bypassed — better to allocate generously.
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // "\" (NUL-terminated) is the sub-block queried for
                        // the fixed version record.
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            // Version compatibility check (mirrors the Linux/macOS path).
            // If version could not be detected (detected_major == 0) we let
            // the load succeed — the ort crate will diagnose further.
            // The message reports the requested `lib_name`, not the resolved
            // module path stored in `path_buf`.
            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
1004
1005#[cfg(any(target_os = "linux", target_os = "macos"))]
1006unsafe fn loaded_library_path_from_handle(handle: *mut std::ffi::c_void) -> Option<String> {
1007    let symbol_name = std::ffi::CString::new("OrtGetApiBase").ok()?;
1008    let symbol = unsafe { libc::dlsym(handle, symbol_name.as_ptr()) };
1009    if symbol.is_null() {
1010        return None;
1011    }
1012
1013    let mut info = std::mem::MaybeUninit::<libc::Dl_info>::uninit();
1014    if unsafe { libc::dladdr(symbol, info.as_mut_ptr()) } == 0 {
1015        return None;
1016    }
1017
1018    let info = unsafe { info.assume_init() };
1019    if info.dli_fname.is_null() {
1020        return None;
1021    }
1022
1023    Some(
1024        unsafe { std::ffi::CStr::from_ptr(info.dli_fname) }
1025            .to_string_lossy()
1026            .into_owned(),
1027    )
1028}
1029
1030#[cfg(any(target_os = "linux", target_os = "macos"))]
1031fn detect_ort_version_from_resolved_or_requested(
1032    resolved_path: Option<String>,
1033    requested_lib_name: &str,
1034) -> (Option<String>, String) {
1035    if let Some(path) = resolved_path {
1036        if let Some(version) = detect_ort_version_from_path(&path) {
1037            return (Some(version), path);
1038        }
1039        return (detect_ort_version_from_path(requested_lib_name), path);
1040    }
1041
1042    (
1043        detect_ort_version_from_path(requested_lib_name),
1044        requested_lib_name.to_string(),
1045    )
1046}
1047
1048#[cfg(any(target_os = "linux", target_os = "macos"))]
1049fn detect_ort_version_from_loaded_library(
1050    handle: *mut std::ffi::c_void,
1051    requested_lib_name: &str,
1052) -> (Option<String>, String) {
1053    detect_ort_version_from_resolved_or_requested(
1054        unsafe { loaded_library_path_from_handle(handle) },
1055        requested_lib_name,
1056    )
1057}
1058
1059/// Try to extract the ORT version from the library filename or resolved symlink.
1060/// Examples: "libonnxruntime.so.1.19.0" → "1.19.0", "libonnxruntime.1.24.4.dylib" → "1.24.4"
1061#[cfg(any(target_os = "linux", target_os = "macos"))]
1062fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
1063    let path = std::path::Path::new(lib_path);
1064
1065    // Try the path as given, then follow symlinks
1066    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
1067        .into_iter()
1068        .flatten()
1069    {
1070        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1071            if let Some(version) = extract_version_from_filename(name) {
1072                return Some(version);
1073            }
1074        }
1075    }
1076
1077    // Also check for versioned siblings in the same directory
1078    if let Some(parent) = path.parent() {
1079        if let Ok(entries) = std::fs::read_dir(parent) {
1080            for entry in entries.flatten() {
1081                if let Some(name) = entry.file_name().to_str() {
1082                    if name.starts_with("libonnxruntime") {
1083                        if let Some(version) = extract_version_from_filename(name) {
1084                            return Some(version);
1085                        }
1086                    }
1087                }
1088            }
1089        }
1090    }
1091
1092    None
1093}
1094
1095/// Extract version from filenames like "libonnxruntime.so.1.19.0" or "libonnxruntime.1.24.4.dylib"
1096#[cfg(any(target_os = "linux", target_os = "macos"))]
1097fn extract_version_from_filename(name: &str) -> Option<String> {
1098    // Match patterns: .so.X.Y.Z or .X.Y.Z.dylib or .X.Y.Z.so
1099    let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
1100    re.find(name).map(|m| m.as_str().to_string())
1101}
1102
/// Build the indented shell command suggested to the user for removing an
/// incompatible ONNX Runtime library.
///
/// * For libraries under `/usr/local/lib`, or referenced only by their bare
///   default name, Linux and macOS get a `sudo rm` of the system-wide glob
///   (plus `ldconfig` on Linux).
/// * Everything else — including all paths on other targets — gets
///   `rm '<path>'`.
///
/// The path is wrapped in single quotes for the shell, so any single quote
/// inside it is rewritten as `'\''`; without that a path such as
/// `/home/o'brien/lib.so` would produce a command that does not remove the
/// intended file.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = lib_path.starts_with("/usr/local/lib")
        || matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");

    if is_system_install {
        #[cfg(target_os = "linux")]
        return "   sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        #[cfg(target_os = "macos")]
        return "   sudo rm /usr/local/lib/libonnxruntime*".to_string();
    }

    // Close the quoted string, emit an escaped quote, reopen the string.
    let shell_safe_path = lib_path.replace('\'', r"'\''");
    format!("   rm '{}'", shell_safe_path)
}
1115
1116/// Build the user-facing error message for an incompatible ONNX Runtime
1117/// install. Extracted as a pure helper so we can unit-test the wording
1118/// stability — the auto-fix recommendation must always come first because
1119/// it's the only safe option, and the system-rm step must remain present
1120/// because some users prefer the system-wide cleanup path.
1121pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1122    format!(
1123        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1124         Solutions:\n\
1125         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1126         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1127         configures the bridge to load it instead of the system library — no \
1128         changes to '{}'.\n\
1129         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1130         {}\n\
1131         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1132         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1133        version,
1134        lib_name,
1135        lib_name,
1136        suggest_removal_command(lib_name),
1137    )
1138}
1139
/// Decide whether an error message means the ONNX Runtime shared library
/// could not be found or loaded.
///
/// Returns `true` when the message (ignoring leading whitespace) starts with
/// the exact prefix `ONNX Runtime not found.`, or when — compared
/// ASCII-case-insensitively — it mentions the runtime by name *and* contains
/// one of the known dynamic-loading failure phrases.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_MARKERS: &[&str] = &["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_MARKERS: &[&str] = &[
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    // Messages produced by our own pre-validation carry this exact prefix.
    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any =
        |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

    mentions_any(RUNTIME_MARKERS) && mentions_any(LOAD_FAILURE_MARKERS)
}
1165
1166pub fn format_embedding_init_error(error: impl Display) -> String {
1167    let message = error.to_string();
1168
1169    if is_onnx_runtime_unavailable(&message) {
1170        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1171    }
1172
1173    format!("failed to initialize semantic embedding model: {message}")
1174}
1175
/// A chunk of code ready for embedding — derived from a Symbol with context enrichment.
///
/// One chunk pairs the metadata needed to show a search hit (`file`, `name`,
/// `kind`, line range, `snippet`) with the text that is actually sent to the
/// embedding backend (`embed_text`).
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// Absolute file path
    pub file: PathBuf,
    /// Symbol name
    pub name: String,
    /// Symbol kind (function, class, struct, etc.)
    pub kind: SymbolKind,
    /// Line range (0-based internally, inclusive): first line of the symbol.
    pub start_line: u32,
    /// Line range (0-based internally, inclusive): last line of the symbol.
    pub end_line: u32,
    /// Whether the symbol is exported
    pub exported: bool,
    /// The enriched text that gets embedded (scope + signature + body snippet).
    /// Refresh-time vector reuse is keyed on this exact string.
    pub embed_text: String,
    /// Short code snippet for display in results
    pub snippet: String,
}
1195
/// A stored embedding entry — chunk metadata + vector
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    /// The chunk this entry describes.
    chunk: SemanticChunk,
    /// Embedding returned by the backend for `chunk.embed_text`.
    vector: Vec<f32>,
}
1202
/// The semantic index — stores embeddings for all symbols in a project
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// One entry per embedded chunk.
    entries: Vec<EmbeddingEntry>,
    /// Track which files are indexed and their mtime for staleness detection.
    /// Its length is what `indexed_file_count` reports.
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Track indexed file sizes alongside mtimes for staleness detection
    file_sizes: HashMap<PathBuf, u64>,
    /// blake3 content hash recorded for each indexed file.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Embedding dimension (384 for MiniLM-L6-v2)
    dimension: usize,
    /// Left as `None` by the constructors in this file.
    /// NOTE(review): who sets it, and what it identifies, is not visible here.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Root of the indexed project; constructors debug-assert it is absolute.
    project_root: PathBuf,
    /// Starts empty in the constructors in this file.
    /// NOTE(review): presumably files whose indexing was postponed — confirm
    /// against the code that populates it.
    deferred_files: HashSet<PathBuf>,
}
1218
/// Per-file metadata gathered while collecting chunks; its three fields are
/// later split into the index's `file_mtimes`, `file_sizes` and `file_hashes`
/// maps.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time recorded for the file.
    mtime: SystemTime,
    /// File size (presumably in bytes — confirm in `collect_file_metadata`).
    size: u64,
    /// blake3 hash recorded for the file's content.
    content_hash: blake3::Hash,
}
1225
/// Outcome of an incremental refresh of the semantic index.
///
/// Every field is a count of files, not chunks.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Files reported as changed.
    pub changed: usize,
    /// Files reported as added.
    pub added: usize,
    /// Files reported as deleted.
    pub deleted: usize,
    /// Number of current/deleted files considered by the refresh.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// `true` when nothing was changed, added or deleted.
    ///
    /// `total_processed` is deliberately not consulted: a refresh can look at
    /// files without touching any of them.
    pub fn is_noop(&self) -> bool {
        let Self {
            changed,
            added,
            deleted,
            ..
        } = *self;
        (changed, added, deleted) == (0, 0, 0)
    }
}
1242
/// Delta describing the result of refreshing a set of files, meant to be
/// applied to an index by `apply_refresh_update` (see the note on
/// `added_entries`).
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Full replacement entries for `completed_paths`, not just newly embedded
    /// chunks. `apply_refresh_update` removes completed paths before extending
    /// this set, so reused chunks must travel in this delta too.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Freshness record paired with each path it belongs to.
    /// NOTE(review): presumably replaces the stored mtime/size/hash for the
    /// path — confirm in `apply_refresh_update`.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Paths whose existing entries are dropped and replaced by
    /// `added_entries` when the delta is applied.
    pub completed_paths: Vec<PathBuf>,
    /// File-level counts for this refresh.
    pub summary: RefreshSummary,
}
1253
/// A previously computed embedding that can be reused when a chunk with the
/// same `embed_text` is seen again for the same file.
#[derive(Debug, Clone)]
struct ReusableEmbedding {
    /// Exact text the vector was computed for; compared verbatim on lookup so
    /// a hash collision can never hand out the wrong vector.
    embed_text: String,
    /// The embedding stored for `embed_text`.
    vector: Vec<f32>,
}

/// Reuse lookup table: file path → blake3 hash of `embed_text` → every stored
/// embedding whose text has that hash.
type ChunkReuseMap = HashMap<PathBuf, HashMap<blake3::Hash, Vec<ReusableEmbedding>>>;
1261
/// Search result from a semantic query.
///
/// Carries the same display metadata as a `SemanticChunk` (without the
/// embedded text) plus ranking information.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File containing the matched symbol.
    pub file: PathBuf,
    /// Symbol name.
    pub name: String,
    /// Symbol kind.
    pub kind: SymbolKind,
    /// First line of the matched symbol.
    pub start_line: u32,
    /// Last line of the matched symbol.
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Short code snippet for display.
    pub snippet: String,
    /// Ranking score for this hit.
    /// NOTE(review): presumably higher means more similar — confirm against
    /// the search implementation.
    pub score: f32,
    /// Static label attached by whatever produced this result.
    /// NOTE(review): the set of possible values is not visible here.
    pub source: &'static str,
}
1275
1276impl SemanticIndex {
1277    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1278        debug_assert!(project_root.is_absolute());
1279        Self {
1280            entries: Vec::new(),
1281            file_mtimes: HashMap::new(),
1282            file_sizes: HashMap::new(),
1283            file_hashes: HashMap::new(),
1284            dimension,
1285            fingerprint: None,
1286            project_root,
1287            deferred_files: HashSet::new(),
1288        }
1289    }
1290
1291    /// Number of embedded symbol entries.
1292    pub fn entry_count(&self) -> usize {
1293        self.entries.len()
1294    }
1295
1296    /// Number of files currently tracked by the semantic index.
1297    pub fn indexed_file_count(&self) -> usize {
1298        self.file_mtimes.len()
1299    }
1300
1301    /// Human-readable status label for the index.
1302    pub fn status_label(&self) -> &'static str {
1303        if self.entries.is_empty() {
1304            "empty"
1305        } else {
1306            "ready"
1307        }
1308    }
1309
1310    fn collect_chunks(
1311        project_root: &Path,
1312        files: &[PathBuf],
1313    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1314        let collect_started = std::time::Instant::now();
1315        let per_file: Vec<(
1316            PathBuf,
1317            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1318        )> = files
1319            .par_iter()
1320            .map_init(HashMap::new, |parsers, file| {
1321                let result = collect_file_metadata(file).and_then(|metadata| {
1322                    collect_file_chunks(project_root, file, parsers)
1323                        .map(|chunks| (metadata, chunks))
1324                });
1325                (file.clone(), result)
1326            })
1327            .collect();
1328
1329        let mut chunks: Vec<SemanticChunk> = Vec::new();
1330        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1331
1332        for (file, result) in per_file {
1333            match result {
1334                Ok((metadata, file_chunks)) => {
1335                    file_metadata.insert(file, metadata);
1336                    chunks.extend(file_chunks);
1337                }
1338                Err(error) => {
1339                    // "unsupported file extension" is expected for non-code files
1340                    // (json, xml, .gitignore, etc.) that get included in the
1341                    // project walk. Pre-fix this was swallowed by .unwrap_or_default();
1342                    // we now skip silently to keep the log clean. Only real read/parse
1343                    // errors are worth surfacing.
1344                    if error == "unsupported file extension" {
1345                        continue;
1346                    }
1347                    slog_warn!(
1348                        "failed to collect semantic chunks for {}: {}",
1349                        file.display(),
1350                        error
1351                    );
1352                }
1353            }
1354        }
1355
1356        slog_info!(
1357            "semantic collect: {} chunks from {} files in {} ms",
1358            chunks.len(),
1359            file_metadata.len(),
1360            collect_started.elapsed().as_millis()
1361        );
1362
1363        (chunks, file_metadata)
1364    }
1365
1366    fn build_chunk_reuse_map(&self, files: &[PathBuf]) -> ChunkReuseMap {
1367        let requested: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1368        let mut reuse_map: ChunkReuseMap = HashMap::new();
1369
1370        for entry in &self.entries {
1371            if !requested.contains(entry.chunk.file.as_path()) {
1372                continue;
1373            }
1374
1375            // `embed_text` is already persisted in the current on-disk format,
1376            // so refresh-time reuse can hash it in memory and confirm the exact
1377            // string without bumping `SEMANTIC_INDEX_VERSION` and forcing every
1378            // user through a full rebuild.
1379            let hash = blake3::hash(entry.chunk.embed_text.as_bytes());
1380            reuse_map
1381                .entry(entry.chunk.file.clone())
1382                .or_default()
1383                .entry(hash)
1384                .or_default()
1385                .push(ReusableEmbedding {
1386                    embed_text: entry.chunk.embed_text.clone(),
1387                    vector: entry.vector.clone(),
1388                });
1389        }
1390
1391        reuse_map
1392    }
1393
1394    fn reusable_vector_for_chunk(
1395        reuse_map: &ChunkReuseMap,
1396        chunk: &SemanticChunk,
1397    ) -> Option<Vec<f32>> {
1398        let hash = blake3::hash(chunk.embed_text.as_bytes());
1399        reuse_map
1400            .get(&chunk.file)?
1401            .get(&hash)?
1402            .iter()
1403            .find(|candidate| candidate.embed_text == chunk.embed_text)
1404            .map(|candidate| candidate.vector.clone())
1405    }
1406
1407    fn entries_for_chunks_with_reuse<F, P>(
1408        chunks: Vec<SemanticChunk>,
1409        reuse_map: &ChunkReuseMap,
1410        embed_fn: &mut F,
1411        max_batch_size: usize,
1412        initial_observed_dimension: Option<usize>,
1413        refresh_label: &str,
1414        progress: &mut P,
1415    ) -> Result<(Vec<EmbeddingEntry>, Option<usize>), String>
1416    where
1417        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1418        P: FnMut(usize, usize),
1419    {
1420        let total_chunks = chunks.len();
1421        progress(0, total_chunks);
1422
1423        let mut entries_by_chunk: Vec<Option<EmbeddingEntry>> = vec![None; total_chunks];
1424        let mut misses: Vec<(usize, SemanticChunk)> = Vec::new();
1425
1426        for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1427            if let Some(vector) = Self::reusable_vector_for_chunk(reuse_map, &chunk) {
1428                entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1429            } else {
1430                misses.push((chunk_index, chunk));
1431            }
1432        }
1433
1434        let mut completed = total_chunks.saturating_sub(misses.len());
1435        if completed > 0 {
1436            progress(completed, total_chunks);
1437        }
1438
1439        let batch_size = max_batch_size.max(1);
1440        let mut observed_dimension = initial_observed_dimension;
1441
1442        for batch_start in (0..misses.len()).step_by(batch_size) {
1443            let batch_end = (batch_start + batch_size).min(misses.len());
1444            let batch_texts: Vec<String> = misses[batch_start..batch_end]
1445                .iter()
1446                .map(|(_, chunk)| chunk.embed_text.clone())
1447                .collect();
1448
1449            let vectors = embed_fn(batch_texts)?;
1450            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1451
1452            if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1453                match observed_dimension {
1454                    None => observed_dimension = Some(dim),
1455                    Some(expected) if dim != expected => {
1456                        return Err(format!(
1457                            "embedding dimension changed during {refresh_label}: \
1458                             cached index uses {expected}, new vectors use {dim}"
1459                        ));
1460                    }
1461                    _ => {}
1462                }
1463            }
1464
1465            for (i, vector) in vectors.into_iter().enumerate() {
1466                let (chunk_index, chunk) = misses[batch_start + i].clone();
1467                entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1468            }
1469
1470            completed += batch_end - batch_start;
1471            progress(completed, total_chunks);
1472        }
1473
1474        let entries = entries_by_chunk
1475            .into_iter()
1476            .map(|entry| entry.expect("semantic refresh accounted for every chunk"))
1477            .collect();
1478
1479        Ok((entries, observed_dimension))
1480    }
1481
1482    fn build_from_chunks<F, P>(
1483        project_root: &Path,
1484        chunks: Vec<SemanticChunk>,
1485        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1486        embed_fn: &mut F,
1487        max_batch_size: usize,
1488        mut progress: Option<&mut P>,
1489    ) -> Result<Self, String>
1490    where
1491        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1492        P: FnMut(usize, usize),
1493    {
1494        debug_assert!(project_root.is_absolute());
1495        let total_chunks = chunks.len();
1496
1497        if chunks.is_empty() {
1498            return Ok(Self {
1499                entries: Vec::new(),
1500                file_mtimes: file_metadata
1501                    .iter()
1502                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
1503                    .collect(),
1504                file_sizes: file_metadata
1505                    .iter()
1506                    .map(|(path, metadata)| (path.clone(), metadata.size))
1507                    .collect(),
1508                file_hashes: file_metadata
1509                    .into_iter()
1510                    .map(|(path, metadata)| (path, metadata.content_hash))
1511                    .collect(),
1512                dimension: DEFAULT_DIMENSION,
1513                fingerprint: None,
1514                project_root: project_root.to_path_buf(),
1515                deferred_files: HashSet::new(),
1516            });
1517        }
1518
1519        // Embed in batches
1520        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1521        let mut expected_dimension: Option<usize> = None;
1522        let batch_size = max_batch_size.max(1);
1523        let embed_started = std::time::Instant::now();
1524        let batch_count = total_chunks.div_ceil(batch_size);
1525        for batch_start in (0..chunks.len()).step_by(batch_size) {
1526            let batch_end = (batch_start + batch_size).min(chunks.len());
1527            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1528                .iter()
1529                .map(|c| c.embed_text.clone())
1530                .collect();
1531
1532            let vectors = embed_fn(batch_texts)?;
1533            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1534
1535            // Track consistent dimension across all batches
1536            if let Some(dim) = vectors.first().map(|v| v.len()) {
1537                match expected_dimension {
1538                    None => expected_dimension = Some(dim),
1539                    Some(expected) if dim != expected => {
1540                        return Err(format!(
1541                            "embedding dimension changed across batches: expected {expected}, got {dim}"
1542                        ));
1543                    }
1544                    _ => {}
1545                }
1546            }
1547
1548            for (i, vector) in vectors.into_iter().enumerate() {
1549                let chunk_idx = batch_start + i;
1550                entries.push(EmbeddingEntry {
1551                    chunk: chunks[chunk_idx].clone(),
1552                    vector,
1553                });
1554            }
1555
1556            if let Some(callback) = progress.as_mut() {
1557                callback(entries.len(), total_chunks);
1558            }
1559        }
1560
1561        let embed_ms = embed_started.elapsed().as_millis();
1562        let rate = (total_chunks as u128 * 1000)
1563            .checked_div(embed_ms)
1564            .unwrap_or(0) as u64;
1565        slog_info!(
1566            "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1567            total_chunks,
1568            batch_count,
1569            embed_ms,
1570            rate
1571        );
1572
1573        let dimension = entries
1574            .first()
1575            .map(|e| e.vector.len())
1576            .unwrap_or(DEFAULT_DIMENSION);
1577
1578        Ok(Self {
1579            entries,
1580            file_mtimes: file_metadata
1581                .iter()
1582                .map(|(path, metadata)| (path.clone(), metadata.mtime))
1583                .collect(),
1584            file_sizes: file_metadata
1585                .iter()
1586                .map(|(path, metadata)| (path.clone(), metadata.size))
1587                .collect(),
1588            file_hashes: file_metadata
1589                .into_iter()
1590                .map(|(path, metadata)| (path, metadata.content_hash))
1591                .collect(),
1592            dimension,
1593            fingerprint: None,
1594            project_root: project_root.to_path_buf(),
1595            deferred_files: HashSet::new(),
1596        })
1597    }
1598
1599    /// Build the semantic index from a set of files using the provided embedding function.
1600    /// `embed_fn` takes a batch of texts and returns a batch of embedding vectors.
1601    pub fn build<F>(
1602        project_root: &Path,
1603        files: &[PathBuf],
1604        embed_fn: &mut F,
1605        max_batch_size: usize,
1606    ) -> Result<Self, String>
1607    where
1608        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1609    {
1610        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1611        Self::build_from_chunks(
1612            project_root,
1613            chunks,
1614            file_mtimes,
1615            embed_fn,
1616            max_batch_size,
1617            Option::<&mut fn(usize, usize)>::None,
1618        )
1619    }
1620
1621    /// Build the semantic index and report embedding progress using entry counts.
1622    pub fn build_with_progress<F, P>(
1623        project_root: &Path,
1624        files: &[PathBuf],
1625        embed_fn: &mut F,
1626        max_batch_size: usize,
1627        progress: &mut P,
1628    ) -> Result<Self, String>
1629    where
1630        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1631        P: FnMut(usize, usize),
1632    {
1633        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1634        let total_chunks = chunks.len();
1635        progress(0, total_chunks);
1636        Self::build_from_chunks(
1637            project_root,
1638            chunks,
1639            file_mtimes,
1640            embed_fn,
1641            max_batch_size,
1642            Some(progress),
1643        )
1644    }
1645
1646    /// Incrementally refresh entries for changed/new files only, preserving cached
1647    /// embeddings for unchanged files. Used when loading the index from disk and
1648    /// finding that a small fraction of files have moved on, deleted, or appeared.
1649    ///
1650    /// Returns `RefreshSummary` describing what changed. On success, `self` is
1651    /// mutated in place and remains a valid index.
1652    ///
1653    /// `current_files` is the full set of files the project considers indexable
1654    /// (typically `walk_project_files(...)`). Files in the cache that are no
1655    /// longer in this set are treated as deleted.
1656    pub fn refresh_stale_files<F, P>(
1657        &mut self,
1658        project_root: &Path,
1659        current_files: &[PathBuf],
1660        embed_fn: &mut F,
1661        max_batch_size: usize,
1662        progress: &mut P,
1663    ) -> Result<RefreshSummary, String>
1664    where
1665        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1666        P: FnMut(usize, usize),
1667    {
1668        self.backfill_missing_file_sizes();
1669
1670        // 1. Bucket files into deleted / changed / added.
1671        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
1672        self.deferred_files
1673            .retain(|path| current_set.contains(path.as_path()));
1674        let total_processed = current_set.len() + self.file_mtimes.len()
1675            - self
1676                .file_mtimes
1677                .keys()
1678                .filter(|path| current_set.contains(path.as_path()))
1679                .count();
1680
1681        // Files in cache that disappeared from disk OR are no longer in the
1682        // walked set. Both cases need their entries dropped.
1683        let mut deleted: Vec<PathBuf> = Vec::new();
1684        let mut changed: Vec<PathBuf> = Vec::new();
1685        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
1686        for indexed_path in &indexed_paths {
1687            if !current_set.contains(indexed_path.as_path()) {
1688                deleted.push(indexed_path.clone());
1689                continue;
1690            }
1691            let cached = match (
1692                self.file_mtimes.get(indexed_path),
1693                self.file_sizes.get(indexed_path),
1694                self.file_hashes.get(indexed_path),
1695            ) {
1696                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
1697                    mtime: *mtime,
1698                    size: *size,
1699                    content_hash: *hash,
1700                }),
1701                _ => None,
1702            };
1703            match cached
1704                .map(|freshness| cache_freshness::verify_file_strict(indexed_path, &freshness))
1705            {
1706                Some(FreshnessVerdict::HotFresh) => {}
1707                Some(FreshnessVerdict::ContentFresh {
1708                    new_mtime,
1709                    new_size,
1710                }) => {
1711                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
1712                    self.file_sizes.insert(indexed_path.clone(), new_size);
1713                }
1714                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
1715                    changed.push(indexed_path.clone());
1716                }
1717            }
1718        }
1719
1720        // Files in walk that were never indexed.
1721        let mut added: Vec<PathBuf> = Vec::new();
1722        for path in current_files {
1723            if !self.file_mtimes.contains_key(path) {
1724                added.push(path.clone());
1725            }
1726        }
1727
1728        // Fast path: nothing to do.
1729        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
1730            progress(0, 0);
1731            return Ok(RefreshSummary {
1732                total_processed,
1733                ..RefreshSummary::default()
1734            });
1735        }
1736
1737        // 2. Drop entries for deleted files immediately. Changed files are only
1738        //    replaced after successful re-extraction + embedding so transient
1739        //    read/parse errors keep the stale-but-valid cache entry.
1740        if !deleted.is_empty() {
1741            self.remove_indexed_files(&deleted);
1742        }
1743
1744        // 3. Embed the changed + added set, if any.
1745        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
1746        to_embed.extend(changed.iter().cloned());
1747        to_embed.extend(added.iter().cloned());
1748
1749        if to_embed.is_empty() {
1750            // Only deletions happened.
1751            progress(0, 0);
1752            return Ok(RefreshSummary {
1753                changed: 0,
1754                added: 0,
1755                deleted: deleted.len(),
1756                total_processed,
1757            });
1758        }
1759
1760        let reuse_map = self.build_chunk_reuse_map(&changed);
1761        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
1762        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
1763        let vanished = to_embed
1764            .iter()
1765            .filter(|path| {
1766                changed_set.contains(path.as_path())
1767                    && !fresh_metadata.contains_key(*path)
1768                    && !path.exists()
1769            })
1770            .cloned()
1771            .collect::<Vec<_>>();
1772        if !vanished.is_empty() {
1773            self.remove_indexed_files(&vanished);
1774            deleted.extend(vanished);
1775        }
1776
1777        if chunks.is_empty() {
1778            progress(0, 0);
1779            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1780            for file in &successful_files {
1781                self.deferred_files.remove(file);
1782            }
1783            if !successful_files.is_empty() {
1784                self.entries
1785                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
1786            }
1787            let changed_count = changed
1788                .iter()
1789                .filter(|path| successful_files.contains(*path))
1790                .count();
1791            let added_count = added
1792                .iter()
1793                .filter(|path| successful_files.contains(*path))
1794                .count();
1795            for (file, metadata) in fresh_metadata {
1796                self.file_mtimes.insert(file.clone(), metadata.mtime);
1797                self.file_sizes.insert(file.clone(), metadata.size);
1798                self.file_hashes.insert(file.clone(), metadata.content_hash);
1799            }
1800            return Ok(RefreshSummary {
1801                changed: changed_count,
1802                added: added_count,
1803                deleted: deleted.len(),
1804                total_processed,
1805            });
1806        }
1807
1808        // 4. Build the full replacement set, reusing cached vectors for chunks
1809        //    whose embed_text is unchanged and embedding only cache misses.
1810        let existing_dimension = if self.entries.is_empty() {
1811            None
1812        } else {
1813            Some(self.dimension)
1814        };
1815        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
1816            chunks,
1817            &reuse_map,
1818            embed_fn,
1819            max_batch_size,
1820            existing_dimension,
1821            "incremental refresh",
1822            progress,
1823        )?;
1824
1825        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1826        for file in &successful_files {
1827            self.deferred_files.remove(file);
1828        }
1829        if !successful_files.is_empty() {
1830            self.entries
1831                .retain(|entry| !successful_files.contains(&entry.chunk.file));
1832        }
1833
1834        self.entries.extend(new_entries);
1835        for (file, metadata) in fresh_metadata {
1836            self.file_mtimes.insert(file.clone(), metadata.mtime);
1837            self.file_sizes.insert(file.clone(), metadata.size);
1838            self.file_hashes.insert(file, metadata.content_hash);
1839        }
1840        if let Some(dim) = observed_dimension {
1841            self.dimension = dim;
1842        }
1843
1844        Ok(RefreshSummary {
1845            changed: changed
1846                .iter()
1847                .filter(|path| successful_files.contains(*path))
1848                .count(),
1849            added: added
1850                .iter()
1851                .filter(|path| successful_files.contains(*path))
1852                .count(),
1853            deleted: deleted.len(),
1854            total_processed,
1855        })
1856    }
1857
1858    /// Refresh exactly the files invalidated by the live watcher, without
1859    /// treating the provided path list as the whole project. This is the
1860    /// watcher-side counterpart to `refresh_stale_files`: it drops any stale
1861    /// entries for the requested paths from this in-memory index, re-extracts
1862    /// whatever still exists on disk, embeds those chunks, and returns the
1863    /// delta needed for another in-memory index to apply the same update.
1864    pub fn refresh_invalidated_files<F, P>(
1865        &mut self,
1866        project_root: &Path,
1867        paths: &[PathBuf],
1868        embed_fn: &mut F,
1869        max_batch_size: usize,
1870        max_files: usize,
1871        progress: &mut P,
1872    ) -> Result<InvalidatedFilesRefresh, String>
1873    where
1874        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1875        P: FnMut(usize, usize),
1876    {
1877        self.backfill_missing_file_sizes();
1878
1879        self.deferred_files.retain(|path| path.exists());
1880        let mut requested_paths = paths.to_vec();
1881        requested_paths.extend(self.deferred_files.iter().cloned());
1882        requested_paths.sort();
1883        requested_paths.dedup();
1884        let total_processed = requested_paths.len();
1885
1886        if requested_paths.is_empty() {
1887            progress(0, 0);
1888            return Ok(InvalidatedFilesRefresh {
1889                summary: RefreshSummary {
1890                    total_processed,
1891                    ..RefreshSummary::default()
1892                },
1893                ..InvalidatedFilesRefresh::default()
1894            });
1895        }
1896
1897        let previously_indexed: HashSet<PathBuf> = requested_paths
1898            .iter()
1899            .filter(|path| self.file_mtimes.contains_key(*path))
1900            .cloned()
1901            .collect();
1902        let reuse_map = self.build_chunk_reuse_map(&requested_paths);
1903
1904        // The watcher path has already invalidated these files in the request
1905        // thread's live index. Mirror that behavior here before inserting any
1906        // fresh chunks so parse/read failures do not resurrect stale entries.
1907        self.remove_indexed_files(&requested_paths);
1908
1909        let existing_paths = requested_paths
1910            .iter()
1911            .filter(|path| path.exists())
1912            .cloned()
1913            .collect::<Vec<_>>();
1914        let deleted = requested_paths
1915            .iter()
1916            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
1917            .count();
1918
1919        if existing_paths.is_empty() {
1920            for path in &requested_paths {
1921                if !path.exists() {
1922                    self.deferred_files.remove(path);
1923                }
1924            }
1925            progress(0, 0);
1926            return Ok(InvalidatedFilesRefresh {
1927                completed_paths: requested_paths,
1928                summary: RefreshSummary {
1929                    deleted,
1930                    total_processed,
1931                    ..RefreshSummary::default()
1932                },
1933                ..InvalidatedFilesRefresh::default()
1934            });
1935        }
1936
1937        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);
1938
1939        let retained_file_count = self.file_mtimes.len();
1940        let changed_successful_count = existing_paths
1941            .iter()
1942            .filter(|path| {
1943                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1944            })
1945            .count();
1946        let available_new_files =
1947            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
1948        let new_successful_files = existing_paths
1949            .iter()
1950            .filter(|path| {
1951                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1952            })
1953            .cloned()
1954            .collect::<Vec<_>>();
1955        if new_successful_files.len() > available_new_files {
1956            let allowed_new_files = new_successful_files
1957                .iter()
1958                .take(available_new_files)
1959                .cloned()
1960                .collect::<HashSet<_>>();
1961            let deferred_new_files = new_successful_files
1962                .into_iter()
1963                .filter(|path| !allowed_new_files.contains(path))
1964                .collect::<HashSet<_>>();
1965
1966            fresh_metadata.retain(|file, _| {
1967                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
1968            });
1969            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));
1970
1971            if !deferred_new_files.is_empty() {
1972                for path in &deferred_new_files {
1973                    self.deferred_files.insert(path.clone());
1974                }
1975                slog_warn!(
1976                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
1977                    deferred_new_files.len(),
1978                    max_files
1979                );
1980            }
1981        }
1982
1983        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1984        for file in &successful_files {
1985            self.deferred_files.remove(file);
1986        }
1987        let changed = successful_files
1988            .iter()
1989            .filter(|path| previously_indexed.contains(path.as_path()))
1990            .count();
1991        let added = successful_files.len().saturating_sub(changed);
1992        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());
1993
1994        if chunks.is_empty() {
1995            progress(0, 0);
1996            for (file, metadata) in fresh_metadata {
1997                let freshness = FileFreshness {
1998                    mtime: metadata.mtime,
1999                    size: metadata.size,
2000                    content_hash: metadata.content_hash,
2001                };
2002                self.file_mtimes.insert(file.clone(), freshness.mtime);
2003                self.file_sizes.insert(file.clone(), freshness.size);
2004                self.file_hashes
2005                    .insert(file.clone(), freshness.content_hash);
2006                updated_metadata.push((file, freshness));
2007            }
2008
2009            return Ok(InvalidatedFilesRefresh {
2010                updated_metadata,
2011                completed_paths: requested_paths,
2012                summary: RefreshSummary {
2013                    changed,
2014                    added,
2015                    deleted,
2016                    total_processed,
2017                },
2018                ..InvalidatedFilesRefresh::default()
2019            });
2020        }
2021
2022        let initial_observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty()
2023        {
2024            None
2025        } else {
2026            Some(self.dimension)
2027        };
2028        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
2029            chunks,
2030            &reuse_map,
2031            embed_fn,
2032            max_batch_size,
2033            initial_observed_dimension,
2034            "invalidated-file refresh",
2035            progress,
2036        )?;
2037
2038        let added_entries = new_entries.clone();
2039        self.entries.extend(new_entries);
2040        for (file, metadata) in fresh_metadata {
2041            let freshness = FileFreshness {
2042                mtime: metadata.mtime,
2043                size: metadata.size,
2044                content_hash: metadata.content_hash,
2045            };
2046            self.file_mtimes.insert(file.clone(), freshness.mtime);
2047            self.file_sizes.insert(file.clone(), freshness.size);
2048            self.file_hashes
2049                .insert(file.clone(), freshness.content_hash);
2050            updated_metadata.push((file, freshness));
2051        }
2052        if let Some(dim) = observed_dimension {
2053            self.dimension = dim;
2054        }
2055
2056        Ok(InvalidatedFilesRefresh {
2057            added_entries,
2058            updated_metadata,
2059            completed_paths: requested_paths,
2060            summary: RefreshSummary {
2061                changed,
2062                added,
2063                deleted,
2064                total_processed,
2065            },
2066        })
2067    }
2068
2069    pub fn apply_refresh_update(
2070        &mut self,
2071        added_entries: Vec<EmbeddingEntry>,
2072        updated_metadata: Vec<(PathBuf, FileFreshness)>,
2073        completed_paths: &[PathBuf],
2074    ) {
2075        // `added_entries` is the complete replacement set for completed paths:
2076        // freshly embedded misses plus reused chunks carrying refreshed metadata.
2077        // Removing first is safe only because producers include both kinds.
2078        self.remove_indexed_files(completed_paths);
2079
2080        let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
2081        self.entries.extend(added_entries);
2082        for (file, freshness) in updated_metadata {
2083            self.file_mtimes.insert(file.clone(), freshness.mtime);
2084            self.file_sizes.insert(file.clone(), freshness.size);
2085            self.file_hashes.insert(file, freshness.content_hash);
2086        }
2087        if let Some(dim) = observed_dimension {
2088            self.dimension = dim;
2089        }
2090    }
2091
2092    fn remove_indexed_files(&mut self, files: &[PathBuf]) {
2093        let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
2094        self.entries
2095            .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
2096        for path in files {
2097            self.file_mtimes.remove(path);
2098            self.file_sizes.remove(path);
2099            self.file_hashes.remove(path);
2100        }
2101    }
2102
2103    /// Search the index with a query embedding, returning top-K results sorted by relevance
2104    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
2105        if self.entries.is_empty() || query_vector.len() != self.dimension {
2106            return Vec::new();
2107        }
2108
2109        let mut scored: Vec<(f32, usize)> = self
2110            .entries
2111            .iter()
2112            .enumerate()
2113            .map(|(i, entry)| {
2114                let mut score = cosine_similarity(query_vector, &entry.vector);
2115                if entry.chunk.exported {
2116                    score *= 1.1;
2117                }
2118                (score, i)
2119            })
2120            .collect();
2121
2122        let keep = top_k.min(scored.len());
2123        if keep == 0 {
2124            return Vec::new();
2125        }
2126
2127        if keep < scored.len() {
2128            scored.select_nth_unstable_by(keep, semantic_score_order);
2129            scored.truncate(keep);
2130        }
2131        scored.sort_by(semantic_score_order);
2132
2133        scored
2134            .into_iter()
2135            // Keep the selected best-first slice mapped without reintroducing the
2136            // old `> 0.0` floor: top_k has already been selected, and zero-score
2137            // tail entries remain observable when requested.
2138            .map(|(score, idx)| {
2139                let entry = &self.entries[idx];
2140                SemanticResult {
2141                    file: entry.chunk.file.clone(),
2142                    name: entry.chunk.name.clone(),
2143                    kind: entry.chunk.kind.clone(),
2144                    start_line: entry.chunk.start_line,
2145                    end_line: entry.chunk.end_line,
2146                    exported: entry.chunk.exported,
2147                    snippet: entry.chunk.snippet.clone(),
2148                    score,
2149                    source: "semantic",
2150                }
2151            })
2152            .collect()
2153    }
2154
2155    /// Number of indexed entries
2156    pub fn len(&self) -> usize {
2157        self.entries.len()
2158    }
2159
2160    /// Check if a file needs re-indexing based on mtime/size
2161    pub fn is_file_stale(&self, file: &Path) -> bool {
2162        let Some(stored_mtime) = self.file_mtimes.get(file) else {
2163            return true;
2164        };
2165        let Some(stored_size) = self.file_sizes.get(file) else {
2166            return true;
2167        };
2168        let Some(stored_hash) = self.file_hashes.get(file) else {
2169            return true;
2170        };
2171        let cached = FileFreshness {
2172            mtime: *stored_mtime,
2173            size: *stored_size,
2174            content_hash: *stored_hash,
2175        };
2176        match cache_freshness::verify_file_strict(file, &cached) {
2177            FreshnessVerdict::HotFresh => false,
2178            FreshnessVerdict::ContentFresh { .. } => false,
2179            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2180        }
2181    }
2182
2183    fn backfill_missing_file_sizes(&mut self) {
2184        for path in self.file_mtimes.keys() {
2185            if self.file_sizes.contains_key(path) {
2186                continue;
2187            }
2188            if let Ok(metadata) = fs::metadata(path) {
2189                self.file_sizes.insert(path.clone(), metadata.len());
2190                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2191                    self.file_hashes.insert(path.clone(), hash);
2192                }
2193            }
2194        }
2195    }
2196
2197    /// Remove entries for a specific file
2198    pub fn remove_file(&mut self, file: &Path) {
2199        self.invalidate_file(file);
2200    }
2201
2202    pub fn invalidate_file(&mut self, file: &Path) {
2203        let canonical_file = canonicalize_existing_or_deleted_path(file);
2204        self.entries
2205            .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2206        self.file_mtimes.remove(file);
2207        self.file_sizes.remove(file);
2208        self.file_hashes.remove(file);
2209        if canonical_file.as_path() != file {
2210            self.file_mtimes.remove(&canonical_file);
2211            self.file_sizes.remove(&canonical_file);
2212            self.file_hashes.remove(&canonical_file);
2213        }
2214    }
2215
2216    /// Get the embedding dimension
2217    pub fn dimension(&self) -> usize {
2218        self.dimension
2219    }
2220
2221    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
2222        self.fingerprint.as_ref()
2223    }
2224
2225    pub fn backend_label(&self) -> Option<&str> {
2226        self.fingerprint.as_ref().map(|f| f.backend.as_str())
2227    }
2228
2229    pub fn model_label(&self) -> Option<&str> {
2230        self.fingerprint.as_ref().map(|f| f.model.as_str())
2231    }
2232
2233    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
2234        self.fingerprint = Some(fingerprint);
2235    }
2236
2237    /// Write the semantic index to disk using atomic temp+rename pattern
2238    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2239        // Don't persist empty indexes — they would be loaded on next startup
2240        // and prevent a fresh build that might find files.
2241        if self.entries.is_empty() {
2242            slog_info!("skipping semantic index persistence (0 entries)");
2243            return;
2244        }
2245        let dir = storage_dir.join("semantic").join(project_key);
2246        if let Err(e) = fs::create_dir_all(&dir) {
2247            slog_warn!("failed to create semantic cache dir: {}", e);
2248            return;
2249        }
2250        let data_path = dir.join("semantic.bin");
2251        let tmp_path = dir.join(format!(
2252            "semantic.bin.tmp.{}.{}",
2253            std::process::id(),
2254            SystemTime::now()
2255                .duration_since(SystemTime::UNIX_EPOCH)
2256                .unwrap_or(Duration::ZERO)
2257                .as_nanos()
2258        ));
2259        let write_result = (|| -> io::Result<usize> {
2260            let file = fs::File::create(&tmp_path)?;
2261            let mut writer = BufWriter::new(file);
2262            let bytes_written = self.write_to_writer(&mut writer)?;
2263            writer.flush()?;
2264            writer.get_ref().sync_all()?;
2265            Ok(bytes_written)
2266        })();
2267        let bytes_written = match write_result {
2268            Ok(bytes_written) => bytes_written,
2269            Err(e) => {
2270                slog_warn!("failed to write semantic index: {}", e);
2271                let _ = fs::remove_file(&tmp_path);
2272                return;
2273            }
2274        };
2275        if let Err(e) = fs::rename(&tmp_path, &data_path) {
2276            slog_warn!("failed to rename semantic index: {}", e);
2277            let _ = fs::remove_file(&tmp_path);
2278            return;
2279        }
2280        slog_info!(
2281            "semantic index persisted: {} entries, {:.1} KB",
2282            self.entries.len(),
2283            bytes_written as f64 / 1024.0
2284        );
2285    }
2286
2287    /// Read the semantic index from disk
2288    pub fn read_from_disk(
2289        storage_dir: &Path,
2290        project_key: &str,
2291        current_canonical_root: &Path,
2292        is_worktree_bridge: bool,
2293        expected_fingerprint: Option<&str>,
2294    ) -> Option<Self> {
2295        debug_assert!(current_canonical_root.is_absolute());
2296        let data_path = storage_dir
2297            .join("semantic")
2298            .join(project_key)
2299            .join("semantic.bin");
2300        let file = fs::File::open(&data_path).ok()?;
2301        let file_len = usize::try_from(file.metadata().ok()?.len()).ok()?;
2302        if file_len < HEADER_BYTES_V1 {
2303            slog_warn!(
2304                "corrupt semantic index (too small: {} bytes), removing",
2305                file_len
2306            );
2307            if !is_worktree_bridge {
2308                let _ = fs::remove_file(&data_path);
2309            }
2310            return None;
2311        }
2312
2313        let mut reader = BufReader::new(file);
2314        let mut version_buf = [0u8; 1];
2315        reader.read_exact(&mut version_buf).ok()?;
2316        let version = version_buf[0];
2317        if version != SEMANTIC_INDEX_VERSION_V6 {
2318            slog_info!(
2319                "cached semantic index version {} is older than {}, rebuilding",
2320                version,
2321                SEMANTIC_INDEX_VERSION_V6
2322            );
2323            if !is_worktree_bridge {
2324                let _ = fs::remove_file(&data_path);
2325            }
2326            return None;
2327        }
2328        match Self::from_reader_after_version(
2329            reader,
2330            version,
2331            current_canonical_root,
2332            Some(file_len),
2333            1,
2334        ) {
2335            Ok(index) => {
2336                if index.entries.is_empty() {
2337                    slog_info!("cached semantic index is empty, will rebuild");
2338                    if !is_worktree_bridge {
2339                        let _ = fs::remove_file(&data_path);
2340                    }
2341                    return None;
2342                }
2343                if let Some(expected) = expected_fingerprint {
2344                    let matches = index
2345                        .fingerprint()
2346                        .map(|fingerprint| fingerprint.matches_expected(expected))
2347                        .unwrap_or(false);
2348                    if !matches {
2349                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2350                        if !is_worktree_bridge {
2351                            let _ = fs::remove_file(&data_path);
2352                        }
2353                        return None;
2354                    }
2355                }
2356                slog_info!(
2357                    "loaded semantic index from disk: {} entries",
2358                    index.entries.len()
2359                );
2360                Some(index)
2361            }
2362            Err(e) => {
2363                slog_warn!("corrupt semantic index, rebuilding: {}", e);
2364                if !is_worktree_bridge {
2365                    let _ = fs::remove_file(&data_path);
2366                }
2367                None
2368            }
2369        }
2370    }
2371
2372    /// Serialize the index to bytes for disk persistence
2373    pub fn to_bytes(&self) -> Vec<u8> {
2374        let mut buf = Vec::new();
2375        self.write_to_writer(&mut buf)
2376            .expect("writing semantic index to Vec cannot fail");
2377        buf
2378    }
2379
2380    fn write_to_writer<W: Write>(&self, writer: &mut W) -> io::Result<usize> {
2381        let mut bytes_written = 0usize;
2382        let fingerprint = self.fingerprint.as_ref().and_then(|fingerprint| {
2383            let encoded = fingerprint.as_string();
2384            if encoded.is_empty() {
2385                None
2386            } else {
2387                Some(encoded)
2388            }
2389        });
2390        let fp_bytes_ref = fingerprint.as_deref().map(str::as_bytes).unwrap_or(&[]);
2391        let file_mtime_count = self
2392            .file_mtimes
2393            .iter()
2394            .filter(|(path, _)| cache_relative_path(&self.project_root, path).is_some())
2395            .count();
2396        let entry_count = self
2397            .entries
2398            .iter()
2399            .filter(|entry| cache_relative_path(&self.project_root, &entry.chunk.file).is_some())
2400            .count();
2401
2402        // Header: version(1) + dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint
2403        //
2404        // V6 is the single write format. Layout extends V5:
2405        //   - fingerprint is always represented (absent ⇒ fingerprint_len=0,
2406        //     no bytes follow). Uniform format simplifies the reader.
2407        //   - paths are relative to project_root.
2408        //   - file metadata stored as secs(u64) + subsec_nanos(u32) + size(u64) + blake3(32).
2409        //     Preserves full APFS/ext4/NTFS precision and catches mtime ties.
2410        //
2411        // V1/V2 remain readable for backward compatibility (see from_bytes).
2412        // V3/V4 load as compatible formats but are rejected on disk so snippets
2413        // and file sizes are rebuilt once.
2414        let version = SEMANTIC_INDEX_VERSION_V6;
2415        write_counted(writer, &[version], &mut bytes_written)?;
2416        write_counted(
2417            writer,
2418            &(self.dimension as u32).to_le_bytes(),
2419            &mut bytes_written,
2420        )?;
2421        write_counted(
2422            writer,
2423            &(entry_count as u32).to_le_bytes(),
2424            &mut bytes_written,
2425        )?;
2426        write_counted(
2427            writer,
2428            &(fp_bytes_ref.len() as u32).to_le_bytes(),
2429            &mut bytes_written,
2430        )?;
2431        write_counted(writer, fp_bytes_ref, &mut bytes_written)?;
2432
2433        // File mtime table: count(4) + entries
2434        // V3 layout per entry: path_len(4) + path + secs(8) + subsec_nanos(4)
2435        write_counted(
2436            writer,
2437            &(file_mtime_count as u32).to_le_bytes(),
2438            &mut bytes_written,
2439        )?;
2440        for (path, mtime) in &self.file_mtimes {
2441            let Some(relative) = cache_relative_path(&self.project_root, path) else {
2442                continue;
2443            };
2444            let relative = relative.to_string_lossy();
2445            let path_bytes = relative.as_bytes();
2446            write_counted(
2447                writer,
2448                &(path_bytes.len() as u32).to_le_bytes(),
2449                &mut bytes_written,
2450            )?;
2451            write_counted(writer, path_bytes, &mut bytes_written)?;
2452            let duration = mtime
2453                .duration_since(SystemTime::UNIX_EPOCH)
2454                .unwrap_or_default();
2455            write_counted(
2456                writer,
2457                &duration.as_secs().to_le_bytes(),
2458                &mut bytes_written,
2459            )?;
2460            write_counted(
2461                writer,
2462                &duration.subsec_nanos().to_le_bytes(),
2463                &mut bytes_written,
2464            )?;
2465            let size = self.file_sizes.get(path).copied().unwrap_or_default();
2466            write_counted(writer, &size.to_le_bytes(), &mut bytes_written)?;
2467            let hash = self
2468                .file_hashes
2469                .get(path)
2470                .copied()
2471                .unwrap_or_else(cache_freshness::zero_hash);
2472            write_counted(writer, hash.as_bytes(), &mut bytes_written)?;
2473        }
2474
2475        // Entries: each is metadata + vector
2476        for entry in &self.entries {
2477            let Some(relative) = cache_relative_path(&self.project_root, &entry.chunk.file) else {
2478                continue;
2479            };
2480            let c = &entry.chunk;
2481
2482            // File path
2483            let relative = relative.to_string_lossy();
2484            let file_bytes = relative.as_bytes();
2485            write_counted(
2486                writer,
2487                &(file_bytes.len() as u32).to_le_bytes(),
2488                &mut bytes_written,
2489            )?;
2490            write_counted(writer, file_bytes, &mut bytes_written)?;
2491
2492            // Name
2493            let name_bytes = c.name.as_bytes();
2494            write_counted(
2495                writer,
2496                &(name_bytes.len() as u32).to_le_bytes(),
2497                &mut bytes_written,
2498            )?;
2499            write_counted(writer, name_bytes, &mut bytes_written)?;
2500
2501            // Kind (1 byte)
2502            write_counted(writer, &[symbol_kind_to_u8(&c.kind)], &mut bytes_written)?;
2503
2504            // Lines + exported
2505            write_counted(
2506                writer,
2507                &(c.start_line as u32).to_le_bytes(),
2508                &mut bytes_written,
2509            )?;
2510            write_counted(
2511                writer,
2512                &(c.end_line as u32).to_le_bytes(),
2513                &mut bytes_written,
2514            )?;
2515            write_counted(writer, &[c.exported as u8], &mut bytes_written)?;
2516
2517            // Snippet
2518            let snippet_bytes = c.snippet.as_bytes();
2519            write_counted(
2520                writer,
2521                &(snippet_bytes.len() as u32).to_le_bytes(),
2522                &mut bytes_written,
2523            )?;
2524            write_counted(writer, snippet_bytes, &mut bytes_written)?;
2525
2526            // Embed text
2527            let embed_bytes = c.embed_text.as_bytes();
2528            write_counted(
2529                writer,
2530                &(embed_bytes.len() as u32).to_le_bytes(),
2531                &mut bytes_written,
2532            )?;
2533            write_counted(writer, embed_bytes, &mut bytes_written)?;
2534
2535            // Vector (f32 array)
2536            for &val in &entry.vector {
2537                write_counted(writer, &val.to_le_bytes(), &mut bytes_written)?;
2538            }
2539        }
2540
2541        Ok(bytes_written)
2542    }
2543
2544    /// Deserialize the index from bytes
2545    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2546        debug_assert!(current_canonical_root.is_absolute());
2547        if data.len() < HEADER_BYTES_V1 {
2548            return Err("data too short".to_string());
2549        }
2550
2551        Self::from_reader_after_version(
2552            Cursor::new(&data[1..]),
2553            data[0],
2554            current_canonical_root,
2555            Some(data.len()),
2556            1,
2557        )
2558    }
2559
2560    fn from_reader_after_version<R: Read>(
2561        reader: R,
2562        version: u8,
2563        current_canonical_root: &Path,
2564        total_len: Option<usize>,
2565        bytes_read: usize,
2566    ) -> Result<Self, String> {
2567        debug_assert!(current_canonical_root.is_absolute());
2568        let mut reader = CountingReader::with_bytes_read(reader, bytes_read);
2569
2570        if version != SEMANTIC_INDEX_VERSION_V1
2571            && version != SEMANTIC_INDEX_VERSION_V2
2572            && version != SEMANTIC_INDEX_VERSION_V3
2573            && version != SEMANTIC_INDEX_VERSION_V4
2574            && version != SEMANTIC_INDEX_VERSION_V5
2575            && version != SEMANTIC_INDEX_VERSION_V6
2576        {
2577            return Err(format!("unsupported version: {}", version));
2578        }
2579        // V2 and newer share the same header layout (V3/V4/V5 only differ from
2580        // V2 in the per-mtime entry layout): version(1) + dimension(4) +
2581        // entry_count(4) + fingerprint_len(4) + fingerprint bytes.
2582        if (version == SEMANTIC_INDEX_VERSION_V2
2583            || version == SEMANTIC_INDEX_VERSION_V3
2584            || version == SEMANTIC_INDEX_VERSION_V4
2585            || version == SEMANTIC_INDEX_VERSION_V5
2586            || version == SEMANTIC_INDEX_VERSION_V6)
2587            && total_len.is_some_and(|len| len < HEADER_BYTES_V2)
2588        {
2589            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2590        }
2591
2592        let dimension = read_u32_stream(&mut reader)? as usize;
2593        let entry_count = read_u32_stream(&mut reader)? as usize;
2594        validate_embedding_dimension(dimension)?;
2595        if entry_count > MAX_ENTRIES {
2596            return Err(format!("too many semantic index entries: {}", entry_count));
2597        }
2598
2599        // Fingerprint handling:
2600        //   - V1: no fingerprint field at all.
2601        //   - V2: fingerprint_len + fingerprint bytes; always present (writer
2602        //     only emitted V2 when fingerprint was Some).
2603        //   - V3+: fingerprint_len always present; fingerprint_len==0 ⇒ None.
2604        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2605            || version == SEMANTIC_INDEX_VERSION_V3
2606            || version == SEMANTIC_INDEX_VERSION_V4
2607            || version == SEMANTIC_INDEX_VERSION_V5
2608            || version == SEMANTIC_INDEX_VERSION_V6;
2609        let fingerprint = if has_fingerprint_field {
2610            let fingerprint_len = read_u32_stream(&mut reader)? as usize;
2611            if total_len
2612                .is_some_and(|len| reader.bytes_read().saturating_add(fingerprint_len) > len)
2613            {
2614                return Err("unexpected end of data reading fingerprint".to_string());
2615            }
2616            if fingerprint_len == 0 {
2617                None
2618            } else {
2619                let mut raw = vec![0u8; fingerprint_len];
2620                read_exact_stream(
2621                    &mut reader,
2622                    &mut raw,
2623                    "unexpected end of data reading fingerprint",
2624                )?;
2625                let raw = String::from_utf8_lossy(&raw).to_string();
2626                Some(
2627                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2628                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2629                )
2630            }
2631        } else {
2632            None
2633        };
2634
2635        // File mtimes
2636        let mtime_count = read_u32_stream(&mut reader)? as usize;
2637        if mtime_count > MAX_ENTRIES {
2638            return Err(format!("too many semantic file mtimes: {}", mtime_count));
2639        }
2640
2641        let vector_bytes = entry_count
2642            .checked_mul(dimension)
2643            .and_then(|count| count.checked_mul(F32_BYTES))
2644            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2645        if total_len.is_some_and(|len| vector_bytes > len.saturating_sub(reader.bytes_read())) {
2646            return Err("semantic index vectors exceed available data".to_string());
2647        }
2648
2649        let mut file_mtimes = HashMap::with_capacity(mtime_count);
2650        let mut file_sizes = HashMap::with_capacity(mtime_count);
2651        let mut file_hashes = HashMap::with_capacity(mtime_count);
2652        for _ in 0..mtime_count {
2653            let path = read_string_stream(&mut reader, total_len)?;
2654            let secs = read_u64_stream(&mut reader)?;
2655            // V3+ persists subsec_nanos alongside secs so staleness checks
2656            // survive restart round-trips. V1/V2 load with 0 nanos, which
2657            // causes one rebuild on upgrade (they never matched live APFS
2658            // mtimes anyway — the bug v0.15.2 fixes). After that rebuild,
2659            // the cache is persisted as V3 and stabilises.
2660            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2661                || version == SEMANTIC_INDEX_VERSION_V4
2662                || version == SEMANTIC_INDEX_VERSION_V5
2663                || version == SEMANTIC_INDEX_VERSION_V6
2664            {
2665                read_u32_stream(&mut reader)?
2666            } else {
2667                0
2668            };
2669            let size =
2670                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2671                    read_u64_stream(&mut reader)?
2672                } else {
2673                    0
2674                };
2675            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2676                let mut hash_bytes = [0u8; 32];
2677                read_exact_stream(
2678                    &mut reader,
2679                    &mut hash_bytes,
2680                    "unexpected end of data reading content hash",
2681                )?;
2682                blake3::Hash::from_bytes(hash_bytes)
2683            } else {
2684                cache_freshness::zero_hash()
2685            };
2686            // Hardening against corrupt / maliciously crafted cache files
2687            // (v0.15.2). `Duration::new(secs, nanos)` can panic when the
2688            // nanosecond carry overflows the second counter, and
2689            // `SystemTime + Duration` can panic on carry past the platform's
2690            // upper bound. Explicit validation keeps a corrupted semantic.bin
2691            // from taking down the whole aft process.
2692            if nanos >= 1_000_000_000 {
2693                return Err(format!(
2694                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
2695                    nanos
2696                ));
2697            }
2698            let duration = std::time::Duration::new(secs, nanos);
2699            let mtime = SystemTime::UNIX_EPOCH
2700                .checked_add(duration)
2701                .ok_or_else(|| {
2702                    format!(
2703                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2704                        secs, nanos
2705                    )
2706                })?;
2707            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2708                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2709                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2710            } else {
2711                PathBuf::from(path)
2712            };
2713            file_mtimes.insert(path.clone(), mtime);
2714            file_sizes.insert(path.clone(), size);
2715            file_hashes.insert(path, content_hash);
2716        }
2717
2718        // Entries
2719        let mut entries = Vec::with_capacity(entry_count);
2720        for _ in 0..entry_count {
2721            let raw_file = PathBuf::from(read_string_stream(&mut reader, total_len)?);
2722            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2723                cached_path_under_root(current_canonical_root, &raw_file)
2724                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2725            } else {
2726                raw_file
2727            };
2728            let name = read_string_stream(&mut reader, total_len)?;
2729
2730            let kind = u8_to_symbol_kind(read_u8_stream(&mut reader, "unexpected end of data")?);
2731
2732            let start_line = read_u32_stream(&mut reader)?;
2733            let end_line = read_u32_stream(&mut reader)?;
2734
2735            let exported = read_u8_stream(&mut reader, "unexpected end of data")? != 0;
2736
2737            let snippet = read_string_stream(&mut reader, total_len)?;
2738            let embed_text = read_string_stream(&mut reader, total_len)?;
2739
2740            // Vector
2741            let vec_bytes = dimension
2742                .checked_mul(F32_BYTES)
2743                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2744            if total_len.is_some_and(|len| reader.bytes_read().saturating_add(vec_bytes) > len) {
2745                return Err("unexpected end of data reading vector".to_string());
2746            }
2747            let mut vector = Vec::with_capacity(dimension);
2748            for _ in 0..dimension {
2749                let mut bytes = [0u8; F32_BYTES];
2750                read_exact_stream(
2751                    &mut reader,
2752                    &mut bytes,
2753                    "unexpected end of data reading vector",
2754                )?;
2755                vector.push(f32::from_le_bytes(bytes));
2756            }
2757
2758            entries.push(EmbeddingEntry {
2759                chunk: SemanticChunk {
2760                    file,
2761                    name,
2762                    kind,
2763                    start_line,
2764                    end_line,
2765                    exported,
2766                    embed_text,
2767                    snippet,
2768                },
2769                vector,
2770            });
2771        }
2772
2773        if entries.len() != entry_count {
2774            return Err(format!(
2775                "semantic cache entry count drift: header={} decoded={}",
2776                entry_count,
2777                entries.len()
2778            ));
2779        }
2780        for entry in &entries {
2781            if !file_mtimes.contains_key(&entry.chunk.file) {
2782                return Err(format!(
2783                    "semantic cache metadata missing for entry file {}",
2784                    entry.chunk.file.display()
2785                ));
2786            }
2787        }
2788
2789        Ok(Self {
2790            entries,
2791            file_mtimes,
2792            file_sizes,
2793            file_hashes,
2794            dimension,
2795            fingerprint,
2796            project_root: current_canonical_root.to_path_buf(),
2797            deferred_files: HashSet::new(),
2798        })
2799    }
2800}
2801
/// Write `bytes` to `writer` in full and, on success, add their length to
/// `bytes_written` (saturating at `usize::MAX`). The counter is left alone
/// when the write fails.
fn write_counted<W: Write>(
    writer: &mut W,
    bytes: &[u8],
    bytes_written: &mut usize,
) -> io::Result<()> {
    writer.write_all(bytes).map(|()| {
        *bytes_written = bytes_written.saturating_add(bytes.len());
    })
}
2811
/// `Read` adapter that keeps a running total of the bytes pulled through it.
struct CountingReader<R> {
    /// The wrapped reader.
    inner: R,
    /// Bytes consumed so far, including the initial offset given at creation.
    bytes_read: usize,
}

impl<R> CountingReader<R> {
    /// Wrap `inner`, starting the counter at `bytes_read` (bytes the caller
    /// consumed before handing the reader over).
    fn with_bytes_read(inner: R, bytes_read: usize) -> Self {
        CountingReader { inner, bytes_read }
    }

    /// Total bytes accounted for so far.
    fn bytes_read(&self) -> usize {
        self.bytes_read
    }
}

impl<R: Read> Read for CountingReader<R> {
    /// Forward the read to the wrapped reader and add the number of bytes it
    /// returned to the counter (saturating). Errors pass through unchanged
    /// and do not alter the counter.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        self.inner.read(buf).map(|filled| {
            self.bytes_read = self.bytes_read.saturating_add(filled);
            filled
        })
    }
}
2834
2835fn read_exact_stream<R: Read>(
2836    reader: &mut CountingReader<R>,
2837    buf: &mut [u8],
2838    eof_message: &'static str,
2839) -> Result<(), String> {
2840    reader.read_exact(buf).map_err(|error| {
2841        if error.kind() == io::ErrorKind::UnexpectedEof {
2842            eof_message.to_string()
2843        } else {
2844            format!("{eof_message}: {error}")
2845        }
2846    })
2847}
2848
2849fn read_u8_stream<R: Read>(
2850    reader: &mut CountingReader<R>,
2851    eof_message: &'static str,
2852) -> Result<u8, String> {
2853    let mut bytes = [0u8; 1];
2854    read_exact_stream(reader, &mut bytes, eof_message)?;
2855    Ok(bytes[0])
2856}
2857
2858fn read_u32_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u32, String> {
2859    let mut bytes = [0u8; 4];
2860    read_exact_stream(reader, &mut bytes, "unexpected end of data reading u32")?;
2861    Ok(u32::from_le_bytes(bytes))
2862}
2863
2864fn read_u64_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u64, String> {
2865    let mut bytes = [0u8; 8];
2866    read_exact_stream(reader, &mut bytes, "unexpected end of data reading u64")?;
2867    Ok(u64::from_le_bytes(bytes))
2868}
2869
2870fn read_string_stream<R: Read>(
2871    reader: &mut CountingReader<R>,
2872    total_len: Option<usize>,
2873) -> Result<String, String> {
2874    let len = read_u32_stream(reader)? as usize;
2875    if total_len.is_some_and(|total_len| reader.bytes_read().saturating_add(len) > total_len) {
2876        return Err("unexpected end of data reading string".to_string());
2877    }
2878    let mut bytes = vec![0u8; len];
2879    read_exact_stream(reader, &mut bytes, "unexpected end of data reading string")?;
2880    Ok(String::from_utf8_lossy(&bytes).to_string())
2881}
2882
2883/// Build enriched embedding text from a symbol with cAST-style context
2884fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2885    let relative = file
2886        .strip_prefix(project_root)
2887        .unwrap_or(file)
2888        .to_string_lossy();
2889
2890    let kind_label = match &symbol.kind {
2891        SymbolKind::Function => "function",
2892        SymbolKind::Class => "class",
2893        SymbolKind::Method => "method",
2894        SymbolKind::Struct => "struct",
2895        SymbolKind::Interface => "interface",
2896        SymbolKind::Enum => "enum",
2897        SymbolKind::TypeAlias => "type",
2898        SymbolKind::Variable => "variable",
2899        SymbolKind::Heading => "heading",
2900        SymbolKind::FileSummary => "file-summary",
2901    };
2902
2903    // Build: "file:relative/path kind:function name:validateAuth signature:fn validateAuth(token: &str) -> bool"
2904    let name = &symbol.name;
2905    let mut text = format!(
2906        "name:{name} file:{} kind:{} name:{name}",
2907        relative, kind_label
2908    );
2909
2910    if let Some(sig) = &symbol.signature {
2911        // Cap the signature: structured parsers (e.g. YAML/Kubernetes) pack
2912        // entire inline scripts (CronJob/Job `command:` bodies, multi-KB) into
2913        // the signature. Appending it unbounded produces a single embed_text
2914        // that overflows the embedding backend's physical batch (e.g. a
2915        // llama.cpp server's 512-token cap), aborting the whole index build
2916        // and silently degrading every search to lexical. 400 chars keeps the
2917        // identifying head of the signature without blowing the budget.
2918        text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
2919    }
2920
2921    // Add body snippet (first ~300 chars of symbol body)
2922    let lines: Vec<&str> = source.lines().collect();
2923    let start = (symbol.range.start_line as usize).min(lines.len());
2924    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2925    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2926    if start < end {
2927        let body: String = lines[start..end]
2928            .iter()
2929            .take(15) // max 15 lines
2930            .copied()
2931            .collect::<Vec<&str>>()
2932            .join("\n");
2933        let snippet = if body.len() > 300 {
2934            format!("{}...", &body[..body.floor_char_boundary(300)])
2935        } else {
2936            body
2937        };
2938        text.push_str(&format!(" body:{}", snippet));
2939    }
2940
2941    // Final defense-in-depth clamp: no single embed_text may exceed the
2942    // backend's per-input budget regardless of which field grew. Most
2943    // backends cap a physical batch around 512 tokens; ~1600 chars stays
2944    // comfortably under that for typical English/code (≈4 chars/token).
2945    truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
2946}
2947
/// Upper bound on characters in a single chunk's `embed_text`. Keeps any one
/// input below typical embedding-backend physical batch limits (~512 tokens)
/// so an oversized symbol cannot abort the whole index build.
///
/// Applied with `truncate_chars` as the last step of `build_embed_text` and
/// `build_file_summary_chunk`. The value rests on the rule of thumb of about
/// 4 characters per token for typical English/code.
const MAX_EMBED_TEXT_CHARS: usize = 1600;
2952
/// Return the first `max_chars` characters (Unicode scalar values) of
/// `value` as a new `String`; shorter inputs are returned whole.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // Byte offset of the first character past the limit, or the full length
    // when the string has no such character.
    let cut = value
        .char_indices()
        .nth(max_chars)
        .map_or(value.len(), |(offset, _)| offset);
    value[..cut].to_owned()
}
2956
2957fn first_leading_doc_comment(source: &str) -> String {
2958    let lines: Vec<&str> = source.lines().collect();
2959    let Some((start, first)) = lines
2960        .iter()
2961        .enumerate()
2962        .find(|(_, line)| !line.trim().is_empty())
2963    else {
2964        return String::new();
2965    };
2966
2967    let trimmed = first.trim_start();
2968    if trimmed.starts_with("/**") {
2969        let mut comment = Vec::new();
2970        for line in lines.iter().skip(start) {
2971            comment.push(*line);
2972            if line.contains("*/") {
2973                break;
2974            }
2975        }
2976        return truncate_chars(&comment.join("\n"), 200);
2977    }
2978
2979    if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2980        let comment = lines
2981            .iter()
2982            .skip(start)
2983            .take_while(|line| {
2984                let trimmed = line.trim_start();
2985                trimmed.starts_with("///") || trimmed.starts_with("//!")
2986            })
2987            .copied()
2988            .collect::<Vec<_>>()
2989            .join("\n");
2990        return truncate_chars(&comment, 200);
2991    }
2992
2993    String::new()
2994}
2995
2996pub fn build_file_summary_chunk(
2997    file: &Path,
2998    project_root: &Path,
2999    source: &str,
3000    top_exports: &[&str],
3001    top_export_signatures: &[Option<&str>],
3002) -> SemanticChunk {
3003    let relative = file.strip_prefix(project_root).unwrap_or(file);
3004    let rel_path = relative.to_string_lossy();
3005    let parent_dir = relative
3006        .parent()
3007        .map(|parent| parent.to_string_lossy().to_string())
3008        .unwrap_or_default();
3009    let name = file
3010        .file_stem()
3011        .map(|stem| stem.to_string_lossy().to_string())
3012        .unwrap_or_default();
3013    let doc = first_leading_doc_comment(source);
3014    let exports = top_exports
3015        .iter()
3016        .take(5)
3017        .copied()
3018        .collect::<Vec<_>>()
3019        .join(",");
3020    let snippet = if doc.is_empty() {
3021        top_export_signatures
3022            .first()
3023            .and_then(|signature| signature.as_deref())
3024            .map(|signature| truncate_chars(signature, 200))
3025            .unwrap_or_default()
3026    } else {
3027        doc.clone()
3028    };
3029
3030    SemanticChunk {
3031        file: file.to_path_buf(),
3032        name,
3033        kind: SymbolKind::FileSummary,
3034        start_line: 0,
3035        end_line: 0,
3036        exported: false,
3037        embed_text: truncate_chars(
3038            &format!(
3039                "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
3040                file.file_stem()
3041                    .map(|stem| stem.to_string_lossy().to_string())
3042                    .unwrap_or_default()
3043            ),
3044            MAX_EMBED_TEXT_CHARS,
3045        ),
3046        snippet,
3047    }
3048}
3049
3050fn parser_for(
3051    parsers: &mut HashMap<crate::parser::LangId, Parser>,
3052    lang: crate::parser::LangId,
3053) -> Result<&mut Parser, String> {
3054    use std::collections::hash_map::Entry;
3055
3056    match parsers.entry(lang) {
3057        Entry::Occupied(entry) => Ok(entry.into_mut()),
3058        Entry::Vacant(entry) => {
3059            let grammar = grammar_for(lang);
3060            let mut parser = Parser::new();
3061            parser
3062                .set_language(&grammar)
3063                .map_err(|error| error.to_string())?;
3064            Ok(entry.insert(parser))
3065        }
3066    }
3067}
3068
/// Whether `path` has one of the file extensions that take part in semantic
/// indexing. The comparison is exact (case-sensitive); paths without a
/// UTF-8 extension are rejected.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
3101
3102fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
3103    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
3104    let mtime = metadata.modified().map_err(|error| error.to_string())?;
3105    let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
3106        .map_err(|error| error.to_string())?
3107        .unwrap_or_else(cache_freshness::zero_hash);
3108    Ok(IndexedFileMetadata {
3109        mtime,
3110        size: metadata.len(),
3111        content_hash,
3112    })
3113}
3114
/// Canonicalize `path`, tolerating a final component that does not exist.
///
/// Resolution order:
///   1. the canonical form of `path` itself;
///   2. otherwise the canonical form of its parent joined with its file name;
///   3. otherwise `path` unchanged (no parent, no file name, or the parent
///      cannot be canonicalized either).
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    let via_parent = || {
        let parent = path.parent()?;
        let file_name = path.file_name()?;
        fs::canonicalize(parent)
            .ok()
            .map(|canonical_parent| canonical_parent.join(file_name))
    };

    fs::canonicalize(path)
        .ok()
        .or_else(via_parent)
        .unwrap_or_else(|| path.to_path_buf())
}
3131
/// Files larger than this are skipped for semantic chunking. The read +
/// tree-sitter parse is transiently O(file size) (tree-sitter can use several×
/// the source bytes), and `par_iter` collection parses many files at once, so an
/// unbounded read here is an OOM vector on a repo with a few multi-MB generated/
/// vendored/minified files. A file this large yields almost no useful embedding
/// anyway (each chunk's embed_text is clamped to MAX_EMBED_TEXT_CHARS), so we
/// track it (0 chunks) instead of reading it — freshness then skips it on later
/// refreshes. 4 MiB keeps essentially all hand-written source while capping the
/// pathological tail.
///
/// The size check in `collect_file_chunks` is strict (`len > limit`), so a file
/// of exactly 4 MiB is still read and parsed.
const MAX_SEMANTIC_FILE_BYTES: u64 = 4 * 1024 * 1024;
3142
3143fn collect_file_chunks(
3144    project_root: &Path,
3145    file: &Path,
3146    parsers: &mut HashMap<crate::parser::LangId, Parser>,
3147) -> Result<Vec<SemanticChunk>, String> {
3148    if !is_semantic_indexed_extension(file) {
3149        return Err("unsupported file extension".to_string());
3150    }
3151    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
3152    // OOM backstop: skip oversized files before the read + parse (tracked with
3153    // zero chunks by the caller, so freshness won't re-read them every refresh).
3154    if std::fs::metadata(file).is_ok_and(|m| m.len() > MAX_SEMANTIC_FILE_BYTES) {
3155        return Ok(Vec::new());
3156    }
3157    let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
3158    let tree = parser_for(parsers, lang)?
3159        .parse(&source, None)
3160        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
3161    let symbols =
3162        extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
3163
3164    Ok(symbols_to_chunks(file, &symbols, &source, project_root))
3165}
3166
3167/// Build a display snippet from a symbol's source
3168fn build_snippet(symbol: &Symbol, source: &str) -> String {
3169    let lines: Vec<&str> = source.lines().collect();
3170    let start = (symbol.range.start_line as usize).min(lines.len());
3171    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
3172    let end = (symbol.range.end_line as usize + 1).min(lines.len());
3173    if start < end {
3174        let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
3175        let mut snippet = snippet_lines.join("\n");
3176        if end - start > 5 {
3177            snippet.push_str("\n  ...");
3178        }
3179        if snippet.len() > 300 {
3180            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
3181        }
3182        snippet
3183    } else {
3184        String::new()
3185    }
3186}
3187
3188/// Convert symbols to semantic chunks with enriched context
3189fn symbols_to_chunks(
3190    file: &Path,
3191    symbols: &[Symbol],
3192    source: &str,
3193    project_root: &Path,
3194) -> Vec<SemanticChunk> {
3195    let mut chunks = Vec::new();
3196    let top_exports_with_signatures = symbols
3197        .iter()
3198        .filter(|symbol| {
3199            symbol.exported
3200                && symbol.parent.is_none()
3201                && !matches!(symbol.kind, SymbolKind::Heading)
3202        })
3203        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
3204        .collect::<Vec<_>>();
3205
3206    let has_only_headings = !symbols.is_empty()
3207        && symbols
3208            .iter()
3209            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
3210    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
3211        let top_exports = top_exports_with_signatures
3212            .iter()
3213            .map(|(name, _)| *name)
3214            .collect::<Vec<_>>();
3215        let top_export_signatures = top_exports_with_signatures
3216            .iter()
3217            .map(|(_, signature)| *signature)
3218            .collect::<Vec<_>>();
3219        chunks.push(build_file_summary_chunk(
3220            file,
3221            project_root,
3222            source,
3223            &top_exports,
3224            &top_export_signatures,
3225        ));
3226    }
3227
3228    for symbol in symbols {
3229        // Skip Markdown / HTML heading chunks: empirically they dominate result
3230        // lists even for code-shaped queries because heading prose embeds well.
3231        // Agents querying for code lose the actual matches under doc noise.
3232        // README/docs queries are still served by grep on the same files.
3233        if matches!(symbol.kind, SymbolKind::Heading) {
3234            continue;
3235        }
3236
3237        // Skip very small symbols (single-line variables, etc.)
3238        let line_count = symbol
3239            .range
3240            .end_line
3241            .saturating_sub(symbol.range.start_line)
3242            + 1;
3243        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
3244            continue;
3245        }
3246
3247        let embed_text = build_embed_text(symbol, source, file, project_root);
3248        let snippet = build_snippet(symbol, source);
3249
3250        chunks.push(SemanticChunk {
3251            file: file.to_path_buf(),
3252            name: symbol.name.clone(),
3253            kind: symbol.kind.clone(),
3254            start_line: symbol.range.start_line,
3255            end_line: symbol.range.end_line,
3256            exported: symbol.exported,
3257            embed_text,
3258            snippet,
3259        });
3260
3261        // Note: Nested symbols are handled separately by the outline system
3262        // Each symbol is indexed individually
3263    }
3264
3265    chunks
3266}
3267
/// Orders `(score, index)` pairs for ranking: higher scores first, ties broken
/// by ascending index.
///
/// NaN scores are ranked after every real score (and tie with each other, so
/// the index decides). Treating NaN as "equal to everything" would make the
/// comparison non-transitive once the index tie-break is applied, and sorting
/// with an inconsistent comparator may panic or produce an unspecified order.
fn semantic_score_order(a: &(f32, usize), b: &(f32, usize)) -> std::cmp::Ordering {
    use std::cmp::Ordering;

    let by_score = match (a.0.is_nan(), b.0.is_nan()) {
        (true, true) => Ordering::Equal,
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
        // Neither side is NaN, so `partial_cmp` always yields `Some`; the
        // fallback only keeps this arm panic-free.
        (false, false) => b.0.partial_cmp(&a.0).unwrap_or(Ordering::Equal),
    };
    by_score.then_with(|| a.1.cmp(&b.1))
}
3273
/// Cosine similarity of two equal-length vectors.
///
/// Returns `0.0` when the lengths differ or when either vector has zero
/// magnitude (which includes empty input).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Single pass accumulating the dot product and both squared norms.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
3297
3298// Serialization helpers
3299fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
3300    match kind {
3301        SymbolKind::Function => 0,
3302        SymbolKind::Class => 1,
3303        SymbolKind::Method => 2,
3304        SymbolKind::Struct => 3,
3305        SymbolKind::Interface => 4,
3306        SymbolKind::Enum => 5,
3307        SymbolKind::TypeAlias => 6,
3308        SymbolKind::Variable => 7,
3309        SymbolKind::Heading => 8,
3310        SymbolKind::FileSummary => 9,
3311    }
3312}
3313
3314fn u8_to_symbol_kind(v: u8) -> SymbolKind {
3315    match v {
3316        0 => SymbolKind::Function,
3317        1 => SymbolKind::Class,
3318        2 => SymbolKind::Method,
3319        3 => SymbolKind::Struct,
3320        4 => SymbolKind::Interface,
3321        5 => SymbolKind::Enum,
3322        6 => SymbolKind::TypeAlias,
3323        7 => SymbolKind::Variable,
3324        8 => SymbolKind::Heading,
3325        9 => SymbolKind::FileSummary,
3326        _ => SymbolKind::Heading,
3327    }
3328}
3329
3330#[cfg(test)]
3331mod tests {
3332    use super::*;
3333    use crate::config::{SemanticBackend, SemanticBackendConfig};
3334    use crate::parser::FileParser;
3335    use std::io::{Read, Write};
3336    use std::net::TcpListener;
3337    use std::thread;
3338
3339    #[test]
3340    fn semantic_index_includes_php_inc_and_scss_extensions() {
3341        for file in ["partial.inc", "index.php", "styles.scss"] {
3342            assert!(
3343                is_semantic_indexed_extension(Path::new(file)),
3344                "{file} should be semantic-index eligible"
3345            );
3346        }
3347    }
3348
3349    #[test]
3350    fn transient_marker_round_trips_and_classifies() {
3351        // A marked transient error is recognized and the marker is stripped for
3352        // display, leaving a clean message.
3353        let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
3354        assert!(embedding_failure_is_transient(&marked));
3355        let clean = strip_transient_embedding_marker(&marked);
3356        assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
3357        assert!(clean.starts_with("openai compatible request failed:"));
3358
3359        // Permanent errors (HTTP 4xx, dimension mismatch) carry no marker and
3360        // are not classified transient — they must fail fast.
3361        for permanent in [
3362            "openai compatible request failed (HTTP 401): Unauthorized",
3363            "embedding dimension mismatch: index has 384, model returned 768",
3364            "too many files (>20000) for semantic indexing (max 20000)",
3365        ] {
3366            assert!(
3367                !embedding_failure_is_transient(permanent),
3368                "{permanent:?} must not be transient"
3369            );
3370            // Stripping a marker-free string is a no-op.
3371            assert_eq!(strip_transient_embedding_marker(permanent), permanent);
3372        }
3373    }
3374
3375    #[test]
3376    fn send_error_transience_separates_connect_timeout_from_4xx() {
3377        // 5xx / 429 are transient; other client errors are not.
3378        assert!(is_retryable_embedding_status(
3379            reqwest::StatusCode::INTERNAL_SERVER_ERROR
3380        ));
3381        assert!(is_retryable_embedding_status(
3382            reqwest::StatusCode::TOO_MANY_REQUESTS
3383        ));
3384        assert!(!is_retryable_embedding_status(
3385            reqwest::StatusCode::UNAUTHORIZED
3386        ));
3387        assert!(!is_retryable_embedding_status(
3388            reqwest::StatusCode::BAD_REQUEST
3389        ));
3390    }
3391
3392    #[test]
3393    fn local_backend_model_loading_body_is_transient() {
3394        // LM Studio / Ollama return a 4xx with a loading/unloaded message while
3395        // the model swaps; these must classify transient so the build self-heals.
3396        for body in [
3397            r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
3398            r#"{"error":"model is loading, please wait"}"#,
3399            r#"{"error":"Model not loaded"}"#,
3400            "Loading model into memory",
3401        ] {
3402            assert!(
3403                embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3404                "{body:?} should be body-transient"
3405            );
3406        }
3407
3408        // A genuine 4xx misconfiguration body must NOT be treated as transient,
3409        // even when it happens to contain generic words from the old broad
3410        // substring matcher.
3411        for body in [
3412            r#"{"error":"invalid api key"}"#,
3413            r#"{"error":"model 'foo' not found"}"#,
3414            "Bad Request: unknown field",
3415            "Bad Request: invalid loading model option",
3416            r#"{"error":"unauthorized while model is being loaded by another account"}"#,
3417        ] {
3418            assert!(
3419                !embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3420                "{body:?} must not be body-transient"
3421            );
3422        }
3423
3424        assert!(
3425            !embedding_response_body_is_transient(
3426                reqwest::StatusCode::UNAUTHORIZED,
3427                r#"{"error":"model is loading, please wait"}"#
3428            ),
3429            "permanent auth failures must not become transient because of body text"
3430        );
3431    }
3432
3433    fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
3434    where
3435        F: Fn(String, String, String) -> String + Send + 'static,
3436    {
3437        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3438        let addr = listener.local_addr().expect("local addr");
3439        let handle = thread::spawn(move || {
3440            let (mut stream, _) = listener.accept().expect("accept request");
3441            let mut buf = Vec::new();
3442            let mut chunk = [0u8; 4096];
3443            let mut header_end = None;
3444            let mut content_length = 0usize;
3445            loop {
3446                let n = stream.read(&mut chunk).expect("read request");
3447                if n == 0 {
3448                    break;
3449                }
3450                buf.extend_from_slice(&chunk[..n]);
3451                if header_end.is_none() {
3452                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3453                        header_end = Some(pos + 4);
3454                        let headers = String::from_utf8_lossy(&buf[..pos + 4]);
3455                        for line in headers.lines() {
3456                            if let Some(value) = line.strip_prefix("Content-Length:") {
3457                                content_length = value.trim().parse::<usize>().unwrap_or(0);
3458                            }
3459                        }
3460                    }
3461                }
3462                if let Some(end) = header_end {
3463                    if buf.len() >= end + content_length {
3464                        break;
3465                    }
3466                }
3467            }
3468
3469            let end = header_end.expect("header terminator");
3470            let request = String::from_utf8_lossy(&buf[..end]).to_string();
3471            let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3472            let mut lines = request.lines();
3473            let request_line = lines.next().expect("request line").to_string();
3474            let path = request_line
3475                .split_whitespace()
3476                .nth(1)
3477                .expect("request path")
3478                .to_string();
3479            let response_body = handler(request_line, path, body);
3480            let response = format!(
3481                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3482                response_body.len(),
3483                response_body
3484            );
3485            stream
3486                .write_all(response.as_bytes())
3487                .expect("write response");
3488        });
3489
3490        (format!("http://{}", addr), handle)
3491    }
3492
    /// Starts a mock server whose responses advertise a 128-byte body but send
    /// only a single `{` byte.
    ///
    /// The server thread serves up to `attempts` connections and stops after
    /// two seconds at the latest, polling a non-blocking listener every 10 ms.
    /// Returns the base URL and the server thread's join handle.
    fn start_truncated_body_server(attempts: usize) -> (String, thread::JoinHandle<()>) {
        let listener = TcpListener::bind("127.0.0.1:0").expect("bind truncated test server");
        // Non-blocking accept lets the loop below enforce its deadline even if
        // fewer than `attempts` connections ever arrive.
        listener
            .set_nonblocking(true)
            .expect("nonblocking listener");
        let addr = listener.local_addr().expect("local addr");
        let handle = thread::spawn(move || {
            let deadline = std::time::Instant::now() + Duration::from_secs(2);
            let mut accepted = 0usize;
            while accepted < attempts && std::time::Instant::now() < deadline {
                match listener.accept() {
                    Ok((mut stream, _)) => {
                        accepted += 1;
                        let mut buf = [0u8; 4096];
                        // The client (under test) uses a 250ms timeout and drops
                        // the connection when the truncated body never completes.
                        // On Windows that disconnect surfaces as a hard socket
                        // error (WSAECONNRESET) on these read/write calls, where
                        // Unix returns a clean EOF. Tolerate both: the mock does
                        // not need the request bytes, and a write to an
                        // already-hung-up client is expected.
                        let _ = stream.read(&mut buf);
                        // Declares Content-Length: 128 but the body is just "{".
                        let response = "HTTP/1.1 200 OK
Content-Type: application/json
Content-Length: 128
Connection: close

{";
                        let _ = stream.write_all(response.as_bytes());
                    }
                    // No pending connection yet: back off briefly and re-check
                    // the deadline.
                    Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
                        thread::sleep(Duration::from_millis(10));
                    }
                    Err(error) => panic!("accept request: {error}"),
                }
            }
        });

        (format!("http://{}", addr), handle)
    }
3533
    /// A response whose body never completes must surface as a transient-marked
    /// error that mentions "response read failed".
    #[test]
    fn response_body_read_failures_are_marked_transient() {
        // The mock serves as many truncated responses as the request helper's
        // attempt limit.
        let (url, handle) = start_truncated_body_server(EMBEDDING_REQUEST_MAX_ATTEMPTS);
        // Short client timeout so each stalled body read gives up quickly.
        let client = Client::builder()
            .timeout(Duration::from_millis(250))
            .build()
            .expect("client");

        let error = send_embedding_request(|| client.post(&url).body("{}"), "test backend")
            .expect_err("truncated body should fail");

        // Join before asserting so a panic in the server thread is reported.
        handle.join().unwrap();
        assert!(
            embedding_failure_is_transient(&error),
            "body read failures should be transient-marked: {error}"
        );
        assert!(error.contains("response read failed"));
    }
3552
3553    fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3554        Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
3555    }
3556
3557    fn write_rust_file(path: &Path, function_name: &str) {
3558        fs::write(
3559            path,
3560            format!("pub fn {function_name}() -> bool {{\n    true\n}}\n"),
3561        )
3562        .unwrap();
3563    }
3564
3565    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3566        let mut embed = test_vector_for_texts;
3567        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3568    }
3569
3570    fn test_project_root() -> PathBuf {
3571        std::env::current_dir().unwrap()
3572    }
3573
3574    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3575        index.file_mtimes.insert(file.to_path_buf(), mtime);
3576        index.file_sizes.insert(file.to_path_buf(), size);
3577        index
3578            .file_hashes
3579            .insert(file.to_path_buf(), cache_freshness::zero_hash());
3580    }
3581
    /// Hand-serializes `index` into a byte buffer tagged with
    /// `SEMANTIC_INDEX_VERSION_V6`.
    ///
    /// Layout written, all integers little-endian:
    /// 1. Header: version byte, dimension (u32), entry count (u32),
    ///    fingerprint length (u32) followed by the fingerprint bytes.
    /// 2. File table: file count (u32), then per file the relative path
    ///    (u32 length + bytes), mtime seconds and subsecond nanos since the
    ///    Unix epoch, size, and the content-hash bytes.
    /// 3. Entries: per entry the relative path, name, kind byte, start/end line
    ///    (u32 each), exported flag byte, snippet, embed text (each string as
    ///    u32 length + bytes), then the vector as raw `f32` values.
    ///
    /// Files and entries for which `cache_relative_path` returns `None` are
    /// left out, and the counts written reflect only what was kept.
    ///
    /// NOTE(review): the name says "legacy" but how V6 relates to the current
    /// on-disk version is not visible in this part of the file — confirm.
    fn legacy_semantic_index_bytes(index: &SemanticIndex) -> Vec<u8> {
        let mut buf = Vec::new();
        // An absent fingerprint and one that encodes to an empty string are
        // both written as a zero-length fingerprint.
        let fingerprint_bytes = index.fingerprint.as_ref().and_then(|fingerprint| {
            let encoded = fingerprint.as_string();
            if encoded.is_empty() {
                None
            } else {
                Some(encoded.into_bytes())
            }
        });
        // Keep the absolute path next to the relative one: the relative path is
        // what gets written, the absolute one keys the size/hash lookups below.
        let file_mtimes: Vec<_> = index
            .file_mtimes
            .iter()
            .filter_map(|(path, mtime)| {
                cache_relative_path(&index.project_root, path)
                    .map(|relative| (relative, path, mtime))
            })
            .collect();
        let entries: Vec<_> = index
            .entries
            .iter()
            .filter_map(|entry| {
                cache_relative_path(&index.project_root, &entry.chunk.file)
                    .map(|relative| (relative, entry))
            })
            .collect();

        // Header.
        buf.push(SEMANTIC_INDEX_VERSION_V6);
        buf.extend_from_slice(&(index.dimension as u32).to_le_bytes());
        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
        buf.extend_from_slice(fp_bytes_ref);

        // File metadata table.
        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
        for (relative, path, mtime) in &file_mtimes {
            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(&path_bytes);
            // An mtime before the Unix epoch is written as zero.
            let duration = mtime
                .duration_since(SystemTime::UNIX_EPOCH)
                .unwrap_or_default();
            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
            // Missing size / hash records fall back to 0 and the zero hash.
            let size = index.file_sizes.get(*path).copied().unwrap_or_default();
            buf.extend_from_slice(&size.to_le_bytes());
            let hash = index
                .file_hashes
                .get(*path)
                .copied()
                .unwrap_or_else(cache_freshness::zero_hash);
            buf.extend_from_slice(hash.as_bytes());
        }

        // Entries: chunk fields followed by the embedding vector.
        for (relative, entry) in &entries {
            let c = &entry.chunk;
            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(&file_bytes);

            let name_bytes = c.name.as_bytes();
            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(name_bytes);

            buf.push(symbol_kind_to_u8(&c.kind));
            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
            buf.push(c.exported as u8);

            let snippet_bytes = c.snippet.as_bytes();
            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(snippet_bytes);

            let embed_bytes = c.embed_text.as_bytes();
            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
            buf.extend_from_slice(embed_bytes);

            // The vector carries no length prefix; only its values are written.
            for &val in &entry.vector {
                buf.extend_from_slice(&val.to_le_bytes());
            }
        }

        buf
    }
3666
    /// Test embedder that remembers every batch of texts it was asked to embed,
    /// so tests can assert how much re-embedding a refresh performed.
    #[derive(Default)]
    struct RecordingEmbedder {
        /// One element per `embed` call, holding that call's texts in order.
        calls: Vec<Vec<String>>,
    }
3671
3672    impl RecordingEmbedder {
3673        fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3674            let vectors = texts
3675                .iter()
3676                .map(|text| deterministic_test_vector(text))
3677                .collect();
3678            self.calls.push(texts);
3679            Ok(vectors)
3680        }
3681
3682        fn total_embedded_texts(&self) -> usize {
3683            self.calls.iter().map(Vec::len).sum()
3684        }
3685
3686        fn embedded_texts(&self) -> Vec<&str> {
3687            self.calls
3688                .iter()
3689                .flat_map(|batch| batch.iter().map(String::as_str))
3690                .collect()
3691        }
3692    }
3693
3694    fn deterministic_test_vector(text: &str) -> Vec<f32> {
3695        let hash = blake3::hash(text.as_bytes());
3696        let bytes = hash.as_bytes();
3697        vec![
3698            1.0,
3699            bytes[0] as f32 / 255.0,
3700            bytes[1] as f32 / 255.0,
3701            bytes[2] as f32 / 255.0,
3702        ]
3703    }
3704
3705    fn build_recorded_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3706        let mut embedder = RecordingEmbedder::default();
3707        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3708        SemanticIndex::build(project_root, files, &mut embed, 16).unwrap()
3709    }
3710
3711    fn force_stale(index: &mut SemanticIndex, file: &Path) {
3712        set_file_metadata(index, file, SystemTime::UNIX_EPOCH, 0);
3713    }
3714
3715    fn write_source(path: &Path, source: &str) {
3716        if let Some(parent) = path.parent() {
3717            fs::create_dir_all(parent).unwrap();
3718        }
3719        fs::write(path, source).unwrap();
3720    }
3721
3722    fn entries_for_file<'a>(index: &'a SemanticIndex, file: &Path) -> Vec<&'a EmbeddingEntry> {
3723        index
3724            .entries
3725            .iter()
3726            .filter(|entry| entry.chunk.file == file)
3727            .collect()
3728    }
3729
3730    fn entry_by_name<'a>(index: &'a SemanticIndex, file: &Path, name: &str) -> &'a EmbeddingEntry {
3731        index
3732            .entries
3733            .iter()
3734            .find(|entry| entry.chunk.file == file && entry.chunk.name == name)
3735            .unwrap_or_else(|| panic!("missing semantic entry {name} in {}", file.display()))
3736    }
3737
3738    fn file_summary_entry<'a>(index: &'a SemanticIndex, file: &Path) -> &'a EmbeddingEntry {
3739        index
3740            .entries
3741            .iter()
3742            .find(|entry| entry.chunk.file == file && entry.chunk.kind == SymbolKind::FileSummary)
3743            .unwrap_or_else(|| panic!("missing file-summary entry in {}", file.display()))
3744    }
3745
    /// Shifting every symbol down by one line must be picked up by
    /// `refresh_stale_files` without re-embedding anything: the entry count and
    /// vectors stay the same while the recorded line numbers move.
    #[test]
    fn refresh_stale_line_shift_reuses_all_chunks_and_retains_entries() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let file = project_root.join("src/lib.rs");
        let original = "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n";
        write_source(&file, original);

        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
        let original_entry_count = index.entries.len();
        let original_alpha_vector = entry_by_name(&index, &file, "alpha").vector.clone();

        // Prepend one blank line, then overwrite the recorded metadata so it no
        // longer matches the file on disk.
        write_source(&file, &format!("\n{original}"));
        force_stale(&mut index, &file);

        // Fresh recorder: only embeds triggered by the refresh are counted.
        let mut embedder = RecordingEmbedder::default();
        let mut embed = |texts: Vec<String>| embedder.embed(texts);
        let mut progress = |_done: usize, _total: usize| {};
        let summary = index
            .refresh_stale_files(
                project_root,
                std::slice::from_ref(&file),
                &mut embed,
                16,
                &mut progress,
            )
            .unwrap();

        assert_eq!(summary.changed, 1);
        assert_eq!(embedder.total_embedded_texts(), 0);
        assert_eq!(index.entries.len(), original_entry_count);
        // `alpha` now starts one line lower but keeps its original vector.
        let shifted_alpha = entry_by_name(&index, &file, "alpha");
        assert_eq!(shifted_alpha.chunk.start_line, 1);
        assert_eq!(shifted_alpha.vector, original_alpha_vector);
    }
3781
    /// For a pure line shift, `refresh_invalidated_files` must embed nothing yet
    /// still return every entry of the file in `added_entries`, so that applying
    /// the update to a second index copy replaces that file's entries in full.
    #[test]
    fn refresh_invalidated_line_shift_emits_full_replacement_delta_for_apply() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let file = project_root.join("src/lib.rs");
        let original = "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n";
        write_source(&file, original);

        // Two copies of the same index: one is refreshed directly, the other
        // only receives the resulting update.
        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
        let mut serving_index = worker_index.clone();
        let original_entry_count = worker_index.entries.len();

        // Prepend one blank line so every symbol moves down by one.
        write_source(&file, &format!("\n{original}"));

        let mut embedder = RecordingEmbedder::default();
        let mut embed = |texts: Vec<String>| embedder.embed(texts);
        let mut progress = |_done: usize, _total: usize| {};
        let update = worker_index
            .refresh_invalidated_files(
                project_root,
                std::slice::from_ref(&file),
                &mut embed,
                16,
                100,
                &mut progress,
            )
            .unwrap();

        assert_eq!(embedder.total_embedded_texts(), 0);
        assert_eq!(update.added_entries.len(), original_entry_count);
        assert_eq!(worker_index.entries.len(), original_entry_count);

        serving_index.apply_refresh_update(
            update.added_entries,
            update.updated_metadata,
            &update.completed_paths,
        );

        // After applying, the serving copy holds the same number of entries
        // (no duplicates) and sees the shifted start line.
        assert_eq!(serving_index.entries.len(), original_entry_count);
        assert_eq!(
            entries_for_file(&serving_index, &file).len(),
            original_entry_count
        );
        assert_eq!(
            entry_by_name(&serving_index, &file, "alpha")
                .chunk
                .start_line,
            1
        );
    }
3832
    /// Editing the body of one function must re-embed exactly that function and
    /// leave the vector of the untouched function as it was.
    #[test]
    fn refresh_invalidated_one_symbol_edit_embeds_only_changed_symbol() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let file = project_root.join("src/lib.rs");
        write_source(
            &file,
            "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n",
        );

        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
        let original_entry_count = index.entries.len();
        let beta_vector = entry_by_name(&index, &file, "beta").vector.clone();

        // Only `alpha`'s body changes (1 -> 10); `beta` is byte-identical.
        write_source(
            &file,
            "pub fn alpha() -> i32 {\n    10\n}\n\npub fn beta() -> i32 {\n    2\n}\n",
        );

        let mut embedder = RecordingEmbedder::default();
        let mut embed = |texts: Vec<String>| embedder.embed(texts);
        let mut progress = |_done: usize, _total: usize| {};
        let update = index
            .refresh_invalidated_files(
                project_root,
                std::slice::from_ref(&file),
                &mut embed,
                16,
                100,
                &mut progress,
            )
            .unwrap();

        // Exactly one text was embedded, and it is the one for `alpha`.
        assert_eq!(embedder.total_embedded_texts(), 1);
        assert!(embedder.embedded_texts()[0].contains("name:alpha"));
        // The update still carries every entry of the file.
        assert_eq!(update.added_entries.len(), original_entry_count);
        assert_eq!(entry_by_name(&index, &file, "beta").vector, beta_vector);
    }
3871
    /// When a file gains a second, byte-identical copy of an already indexed
    /// function, both resulting entries must carry the original vector and no
    /// text may be re-embedded.
    #[test]
    fn refresh_reuses_one_old_vector_for_two_byte_identical_symbols() {
        let temp = tempfile::tempdir().unwrap();
        let project_root = temp.path();
        let file = project_root.join("src/dupe.js");
        let one_duplicate = "function duplicate() {\n  return 1;\n}\n";
        write_source(&file, one_duplicate);

        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
        let original_vector = entry_by_name(&index, &file, "duplicate").vector.clone();

        // The file now contains the same function twice, separated by a blank
        // line.
        write_source(&file, &format!("{one_duplicate}\n{one_duplicate}"));

        let mut embedder = RecordingEmbedder::default();
        let mut embed = |texts: Vec<String>| embedder.embed(texts);
        let mut progress = |_done: usize, _total: usize| {};
        index
            .refresh_invalidated_files(
                project_root,
                std::slice::from_ref(&file),
                &mut embed,
                16,
                100,
                &mut progress,
            )
            .unwrap();

        let duplicate_entries = index
            .entries
            .iter()
            .filter(|entry| entry.chunk.file == file && entry.chunk.name == "duplicate")
            .collect::<Vec<_>>();
        assert_eq!(duplicate_entries.len(), 2);
        assert_eq!(embedder.total_embedded_texts(), 0);
        // Both copies share the single previously stored vector.
        assert_eq!(duplicate_entries[0].vector, original_vector);
        assert_eq!(duplicate_entries[1].vector, original_vector);
    }
3909
3910    #[test]
3911    fn file_summary_reuses_on_body_edit_and_misses_on_leading_doc_edit() {
3912        let temp = tempfile::tempdir().unwrap();
3913        let project_root = temp.path();
3914        let file = project_root.join("src/lib.rs");
3915        write_source(
3916            &file,
3917            "//! module docs v1\n\npub fn alpha() -> i32 {\n    1\n}\n",
3918        );
3919
3920        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3921        let summary_before = file_summary_entry(&index, &file).vector.clone();
3922
3923        write_source(
3924            &file,
3925            "//! module docs v1\n\npub fn alpha() -> i32 {\n    2\n}\n",
3926        );
3927        let mut body_embedder = RecordingEmbedder::default();
3928        let mut body_embed = |texts: Vec<String>| body_embedder.embed(texts);
3929        let mut progress = |_done: usize, _total: usize| {};
3930        index
3931            .refresh_invalidated_files(
3932                project_root,
3933                std::slice::from_ref(&file),
3934                &mut body_embed,
3935                16,
3936                100,
3937                &mut progress,
3938            )
3939            .unwrap();
3940        assert_eq!(body_embedder.total_embedded_texts(), 1);
3941        assert!(body_embedder.embedded_texts()[0].contains("name:alpha"));
3942        assert_eq!(file_summary_entry(&index, &file).vector, summary_before);
3943
3944        write_source(
3945            &file,
3946            "//! module docs v2\n\npub fn alpha() -> i32 {\n    2\n}\n",
3947        );
3948        let mut doc_embedder = RecordingEmbedder::default();
3949        let mut doc_embed = |texts: Vec<String>| doc_embedder.embed(texts);
3950        index
3951            .refresh_invalidated_files(
3952                project_root,
3953                std::slice::from_ref(&file),
3954                &mut doc_embed,
3955                16,
3956                100,
3957                &mut progress,
3958            )
3959            .unwrap();
3960
3961        assert_eq!(doc_embedder.total_embedded_texts(), 1);
3962        assert!(doc_embedder.embedded_texts()[0].contains("kind:file-summary"));
3963        assert_ne!(file_summary_entry(&index, &file).vector, summary_before);
3964    }
3965
3966    #[test]
3967    fn refresh_invalidated_deleted_file_drops_entries_without_embedding() {
3968        let temp = tempfile::tempdir().unwrap();
3969        let project_root = temp.path();
3970        let file = project_root.join("src/lib.rs");
3971        write_source(&file, "pub fn alpha() -> i32 {\n    1\n}\n");
3972
3973        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3974        let mut serving_index = worker_index.clone();
3975        fs::remove_file(&file).unwrap();
3976
3977        let mut embedder = RecordingEmbedder::default();
3978        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3979        let mut progress = |_done: usize, _total: usize| {};
3980        let update = worker_index
3981            .refresh_invalidated_files(
3982                project_root,
3983                std::slice::from_ref(&file),
3984                &mut embed,
3985                16,
3986                100,
3987                &mut progress,
3988            )
3989            .unwrap();
3990
3991        assert_eq!(update.summary.deleted, 1);
3992        assert_eq!(embedder.total_embedded_texts(), 0);
3993        assert!(worker_index.entries.is_empty());
3994
3995        serving_index.apply_refresh_update(
3996            update.added_entries,
3997            update.updated_metadata,
3998            &update.completed_paths,
3999        );
4000        assert!(serving_index.entries.is_empty());
4001    }
4002
4003    #[test]
4004    fn watcher_collect_failure_does_not_resurrect_stale_entries() {
4005        let temp = tempfile::tempdir().unwrap();
4006        let project_root = temp.path();
4007        let file = project_root.join("src/lib.rs");
4008        write_source(&file, "pub fn alpha() -> i32 {\n    1\n}\n");
4009
4010        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4011        let mut serving_index = worker_index.clone();
4012        fs::write(&file, [0xff, 0xfe, 0xfd]).unwrap();
4013
4014        let mut embedder = RecordingEmbedder::default();
4015        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4016        let mut progress = |_done: usize, _total: usize| {};
4017        let update = worker_index
4018            .refresh_invalidated_files(
4019                project_root,
4020                std::slice::from_ref(&file),
4021                &mut embed,
4022                16,
4023                100,
4024                &mut progress,
4025            )
4026            .unwrap();
4027
4028        assert_eq!(embedder.total_embedded_texts(), 0);
4029        assert!(update.added_entries.is_empty());
4030        assert!(worker_index.entries.is_empty());
4031        assert!(!worker_index.file_mtimes.contains_key(&file));
4032
4033        serving_index.apply_refresh_update(
4034            update.added_entries,
4035            update.updated_metadata,
4036            &update.completed_paths,
4037        );
4038        assert!(serving_index.entries.is_empty());
4039        assert!(!serving_index.file_mtimes.contains_key(&file));
4040    }
4041
4042    #[test]
4043    fn refresh_invalidated_cap_deferral_remains_file_count_based() {
4044        let temp = tempfile::tempdir().unwrap();
4045        let project_root = temp.path();
4046        let indexed = project_root.join("src/a.rs");
4047        let deferred = project_root.join("src/b.rs");
4048        write_source(&indexed, "pub fn alpha() -> i32 {\n    1\n}\n");
4049        write_source(&deferred, "pub fn beta() -> i32 {\n    2\n}\n");
4050
4051        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&indexed));
4052        let mut embedder = RecordingEmbedder::default();
4053        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4054        let mut progress = |_done: usize, _total: usize| {};
4055        let update = index
4056            .refresh_invalidated_files(
4057                project_root,
4058                std::slice::from_ref(&deferred),
4059                &mut embed,
4060                16,
4061                1,
4062                &mut progress,
4063            )
4064            .unwrap();
4065
4066        assert_eq!(update.summary.total_processed, 1);
4067        assert_eq!(update.summary.added, 0);
4068        assert_eq!(embedder.total_embedded_texts(), 0);
4069        assert_eq!(index.indexed_file_count(), 1);
4070        assert!(index.deferred_files.contains(&deferred));
4071        assert!(entries_for_file(&index, &deferred).is_empty());
4072    }
4073
4074    #[test]
4075    fn semantic_cache_serialization_skips_paths_outside_project_root() {
4076        let dir = tempfile::tempdir().expect("create temp dir");
4077        let project = fs::canonicalize(dir.path()).expect("canonical project");
4078        let outside = project.join("..").join("outside.rs");
4079        let mut index = SemanticIndex::new(project.clone(), 3);
4080        index
4081            .file_mtimes
4082            .insert(outside.clone(), SystemTime::UNIX_EPOCH);
4083        index.file_sizes.insert(outside.clone(), 1);
4084        index
4085            .file_hashes
4086            .insert(outside.clone(), cache_freshness::zero_hash());
4087        index.entries.push(EmbeddingEntry {
4088            chunk: SemanticChunk {
4089                file: outside,
4090                name: "outside".to_string(),
4091                kind: SymbolKind::Function,
4092                start_line: 0,
4093                end_line: 0,
4094                exported: false,
4095                embed_text: "outside".to_string(),
4096                snippet: "outside".to_string(),
4097            },
4098            vector: vec![1.0, 0.0, 0.0],
4099        });
4100
4101        let bytes = index.to_bytes();
4102        let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
4103        assert_eq!(loaded.entries.len(), 0);
4104        assert!(loaded.file_mtimes.is_empty());
4105    }
4106
4107    #[test]
4108    fn semantic_search_bounded_top_k_matches_reference_full_sort() {
4109        let project_root = test_project_root();
4110        let file = project_root.join("src/lib.rs");
4111        let mut index = SemanticIndex::new(project_root, 2);
4112        let entries = [
4113            ("alpha", vec![1.0, 0.0], false),
4114            ("beta", vec![0.0, 1.0], false),
4115            ("gamma", vec![1.0, 0.0], false),
4116            ("delta", vec![0.5, 0.5], true),
4117            ("epsilon", vec![-1.0, 0.0], false),
4118        ];
4119        for (line, (name, vector, exported)) in entries.into_iter().enumerate() {
4120            index.entries.push(EmbeddingEntry {
4121                chunk: SemanticChunk {
4122                    file: file.clone(),
4123                    name: name.to_string(),
4124                    kind: SymbolKind::Function,
4125                    start_line: line as u32 + 1,
4126                    end_line: line as u32 + 1,
4127                    exported,
4128                    embed_text: name.to_string(),
4129                    snippet: format!("fn {name}() {{}}"),
4130                },
4131                vector,
4132            });
4133        }
4134
4135        let query = vec![1.0, 0.0];
4136        let top_k = 4;
4137        let mut reference: Vec<(f32, usize)> = index
4138            .entries
4139            .iter()
4140            .enumerate()
4141            .map(|(idx, entry)| {
4142                let mut score = cosine_similarity(&query, &entry.vector);
4143                if entry.chunk.exported {
4144                    score *= 1.1;
4145                }
4146                (score, idx)
4147            })
4148            .collect();
4149        reference.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
4150        let expected: Vec<(String, f32)> = reference
4151            .into_iter()
4152            .take(top_k)
4153            .map(|(score, idx)| (index.entries[idx].chunk.name.clone(), score))
4154            .collect();
4155
4156        let actual: Vec<(String, f32)> = index
4157            .search(&query, top_k)
4158            .into_iter()
4159            .map(|result| (result.name, result.score))
4160            .collect();
4161
4162        assert_eq!(
4163            actual.iter().map(|(name, _)| name).collect::<Vec<_>>(),
4164            expected.iter().map(|(name, _)| name).collect::<Vec<_>>()
4165        );
4166        for ((_, actual_score), (_, expected_score)) in actual.iter().zip(expected.iter()) {
4167            assert!((actual_score - expected_score).abs() < 1e-6);
4168        }
4169        assert_eq!(actual[0].0, "alpha");
4170        assert_eq!(actual[1].0, "gamma", "equal scores keep insertion order");
4171        assert!(index.search(&query, 0).is_empty());
4172    }
4173
4174    #[test]
4175    fn test_cosine_similarity_identical() {
4176        let a = vec![1.0, 0.0, 0.0];
4177        let b = vec![1.0, 0.0, 0.0];
4178        assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
4179    }
4180
4181    #[test]
4182    fn test_cosine_similarity_orthogonal() {
4183        let a = vec![1.0, 0.0, 0.0];
4184        let b = vec![0.0, 1.0, 0.0];
4185        assert!(cosine_similarity(&a, &b).abs() < 0.001);
4186    }
4187
4188    #[test]
4189    fn test_cosine_similarity_opposite() {
4190        let a = vec![1.0, 0.0, 0.0];
4191        let b = vec![-1.0, 0.0, 0.0];
4192        assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
4193    }
4194
4195    #[test]
4196    fn test_serialization_roundtrip() {
4197        let project_root = test_project_root();
4198        let file = project_root.join("src/main.rs");
4199        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
4200        index.entries.push(EmbeddingEntry {
4201            chunk: SemanticChunk {
4202                file: file.clone(),
4203                name: "handle_request".to_string(),
4204                kind: SymbolKind::Function,
4205                start_line: 10,
4206                end_line: 25,
4207                exported: true,
4208                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
4209                snippet: "fn handle_request() {\n  // ...\n}".to_string(),
4210            },
4211            vector: vec![0.1, 0.2, 0.3, 0.4],
4212        });
4213        index.dimension = 4;
4214        index
4215            .file_mtimes
4216            .insert(file.clone(), SystemTime::UNIX_EPOCH);
4217        index.file_sizes.insert(file, 0);
4218        index.set_fingerprint(SemanticIndexFingerprint {
4219            backend: "fastembed".to_string(),
4220            model: "all-MiniLM-L6-v2".to_string(),
4221            base_url: FALLBACK_BACKEND.to_string(),
4222            dimension: 4,
4223            chunking_version: default_chunking_version(),
4224        });
4225
4226        let bytes = index.to_bytes();
4227        let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
4228
4229        assert_eq!(restored.entries.len(), 1);
4230        assert_eq!(restored.entries[0].chunk.name, "handle_request");
4231        assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
4232        assert_eq!(restored.dimension, 4);
4233        assert_eq!(restored.backend_label(), Some("fastembed"));
4234        assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
4235    }
4236
4237    #[test]
4238    fn semantic_cache_streaming_persistence_matches_legacy_bytes_and_round_trips() {
4239        let storage = tempfile::tempdir().expect("create storage dir");
4240        let project = storage.path().join("project");
4241        fs::create_dir_all(project.join("src")).expect("create project src");
4242        let file = project.join("src/lib.rs");
4243        fs::write(&file, "pub fn alpha() {}\npub fn beta() {}\n").expect("write source");
4244        let project_root = fs::canonicalize(&project).expect("canonical project");
4245        let file = fs::canonicalize(&file).expect("canonical file");
4246
4247        let mut index = SemanticIndex::new(project_root.clone(), 3);
4248        let mtime = SystemTime::UNIX_EPOCH + Duration::new(123, 456);
4249        index.file_mtimes.insert(file.clone(), mtime);
4250        index.file_sizes.insert(file.clone(), 42);
4251        index
4252            .file_hashes
4253            .insert(file.clone(), cache_freshness::zero_hash());
4254        index.entries.push(EmbeddingEntry {
4255            chunk: SemanticChunk {
4256                file: file.clone(),
4257                name: "alpha".to_string(),
4258                kind: SymbolKind::Function,
4259                start_line: 0,
4260                end_line: 0,
4261                exported: true,
4262                embed_text: "file:src/lib.rs kind:function name:alpha".to_string(),
4263                snippet: "pub fn alpha() {}".to_string(),
4264            },
4265            vector: vec![0.1, 0.2, 0.3],
4266        });
4267        index.entries.push(EmbeddingEntry {
4268            chunk: SemanticChunk {
4269                file: file.clone(),
4270                name: "beta".to_string(),
4271                kind: SymbolKind::Function,
4272                start_line: 1,
4273                end_line: 1,
4274                exported: true,
4275                embed_text: "file:src/lib.rs kind:function name:beta".to_string(),
4276                snippet: "pub fn beta() {}".to_string(),
4277            },
4278            vector: vec![0.4, 0.5, 0.6],
4279        });
4280        let fingerprint = SemanticIndexFingerprint {
4281            backend: "fastembed".to_string(),
4282            model: "all-MiniLM-L6-v2".to_string(),
4283            base_url: FALLBACK_BACKEND.to_string(),
4284            dimension: 3,
4285            chunking_version: default_chunking_version(),
4286        };
4287        index.set_fingerprint(fingerprint.clone());
4288
4289        let legacy_bytes = legacy_semantic_index_bytes(&index);
4290        assert_eq!(index.to_bytes(), legacy_bytes);
4291
4292        index.write_to_disk(storage.path(), "proj");
4293        let data_path = storage.path().join("semantic/proj/semantic.bin");
4294        assert_eq!(
4295            fs::read(&data_path).expect("read semantic.bin"),
4296            legacy_bytes
4297        );
4298
4299        let loaded = SemanticIndex::read_from_disk(
4300            storage.path(),
4301            "proj",
4302            &project_root,
4303            false,
4304            Some(&fingerprint.as_string()),
4305        )
4306        .expect("load semantic index");
4307        assert_eq!(loaded.entries.len(), index.entries.len());
4308        assert_eq!(loaded.dimension, index.dimension);
4309        assert_eq!(
4310            loaded.fingerprint().unwrap().as_string(),
4311            fingerprint.as_string()
4312        );
4313        assert_eq!(loaded.file_mtimes.get(&file), Some(&mtime));
4314        assert_eq!(loaded.file_sizes.get(&file), Some(&42));
4315        assert_eq!(
4316            loaded.file_hashes.get(&file),
4317            Some(&cache_freshness::zero_hash())
4318        );
4319        for (actual, expected) in loaded.entries.iter().zip(index.entries.iter()) {
4320            assert_eq!(actual.chunk.file, expected.chunk.file);
4321            assert_eq!(actual.chunk.name, expected.chunk.name);
4322            assert_eq!(actual.chunk.kind, expected.chunk.kind);
4323            assert_eq!(actual.chunk.start_line, expected.chunk.start_line);
4324            assert_eq!(actual.chunk.end_line, expected.chunk.end_line);
4325            assert_eq!(actual.chunk.exported, expected.chunk.exported);
4326            assert_eq!(actual.chunk.embed_text, expected.chunk.embed_text);
4327            assert_eq!(actual.chunk.snippet, expected.chunk.snippet);
4328            assert_eq!(actual.vector, expected.vector);
4329        }
4330        assert_eq!(loaded.to_bytes(), legacy_bytes);
4331    }
4332
4333    #[test]
4334    fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
4335        let cases = [
4336            (SymbolKind::Function, 0),
4337            (SymbolKind::Class, 1),
4338            (SymbolKind::Method, 2),
4339            (SymbolKind::Struct, 3),
4340            (SymbolKind::Interface, 4),
4341            (SymbolKind::Enum, 5),
4342            (SymbolKind::TypeAlias, 6),
4343            (SymbolKind::Variable, 7),
4344            (SymbolKind::Heading, 8),
4345            (SymbolKind::FileSummary, 9),
4346        ];
4347
4348        for (kind, encoded) in cases {
4349            assert_eq!(symbol_kind_to_u8(&kind), encoded);
4350            assert_eq!(u8_to_symbol_kind(encoded), kind);
4351        }
4352    }
4353
4354    #[test]
4355    fn test_search_top_k() {
4356        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4357        index.dimension = 3;
4358
4359        // Add entries with known vectors
4360        for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
4361            let mut vec = vec![0.0f32; 3];
4362            vec[i] = 1.0; // orthogonal vectors
4363            index.entries.push(EmbeddingEntry {
4364                chunk: SemanticChunk {
4365                    file: PathBuf::from("/src/lib.rs"),
4366                    name: name.to_string(),
4367                    kind: SymbolKind::Function,
4368                    start_line: (i * 10 + 1) as u32,
4369                    end_line: (i * 10 + 5) as u32,
4370                    exported: true,
4371                    embed_text: format!("kind:function name:{}", name),
4372                    snippet: format!("fn {}() {{}}", name),
4373                },
4374                vector: vec,
4375            });
4376        }
4377
4378        // Query aligned with "auth" (index 0)
4379        let query = vec![0.9, 0.1, 0.0];
4380        let results = index.search(&query, 2);
4381
4382        assert_eq!(results.len(), 2);
4383        assert_eq!(results[0].name, "auth"); // highest score
4384        assert!(results[0].score > results[1].score);
4385    }
4386
4387    #[test]
4388    fn test_empty_index_search() {
4389        let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4390        let results = index.search(&[0.1, 0.2, 0.3], 10);
4391        assert!(results.is_empty());
4392    }
4393
4394    #[test]
4395    fn single_line_symbol_builds_non_empty_snippet() {
4396        let symbol = Symbol {
4397            name: "answer".to_string(),
4398            kind: SymbolKind::Variable,
4399            range: crate::symbols::Range {
4400                start_line: 0,
4401                start_col: 0,
4402                end_line: 0,
4403                end_col: 24,
4404            },
4405            signature: Some("const answer = 42".to_string()),
4406            scope_chain: Vec::new(),
4407            exported: true,
4408            parent: None,
4409        };
4410        let source = "export const answer = 42;\n";
4411
4412        let snippet = build_snippet(&symbol, source);
4413
4414        assert_eq!(snippet, "export const answer = 42;");
4415    }
4416
4417    #[test]
4418    fn optimized_file_chunk_collection_matches_file_parser_path() {
4419        let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
4420        let file = project_root.join("src/semantic_index.rs");
4421        let source = std::fs::read_to_string(&file).unwrap();
4422
4423        let mut legacy_parser = FileParser::new();
4424        let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
4425        let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
4426
4427        let mut parsers = HashMap::new();
4428        let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
4429
4430        assert_eq!(
4431            chunk_fingerprint(&optimized_chunks),
4432            chunk_fingerprint(&legacy_chunks)
4433        );
4434    }
4435
4436    fn chunk_fingerprint(
4437        chunks: &[SemanticChunk],
4438    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
4439        chunks
4440            .iter()
4441            .map(|chunk| {
4442                (
4443                    chunk.name.clone(),
4444                    chunk.kind.clone(),
4445                    chunk.start_line,
4446                    chunk.end_line,
4447                    chunk.exported,
4448                    chunk.embed_text.clone(),
4449                    chunk.snippet.clone(),
4450                )
4451            })
4452            .collect()
4453    }
4454
4455    #[test]
4456    fn collect_file_chunks_skips_oversized_file() {
4457        let dir = tempfile::tempdir().unwrap();
4458        let big = dir.path().join("huge.ts");
4459        // Just over the cap: a valid TS file that would otherwise yield chunks.
4460        let filler = "export const x = 1;\n"
4461            .repeat(((MAX_SEMANTIC_FILE_BYTES as usize) / "export const x = 1;\n".len()) + 16);
4462        std::fs::write(&big, &filler).unwrap();
4463        assert!(big.metadata().unwrap().len() > MAX_SEMANTIC_FILE_BYTES);
4464
4465        let mut parsers = HashMap::new();
4466        // Oversized → tracked with zero chunks, NOT an error (so the caller keeps
4467        // the file in metadata and freshness skips re-reading it).
4468        let chunks = collect_file_chunks(dir.path(), &big, &mut parsers).unwrap();
4469        assert!(chunks.is_empty(), "oversized file must yield no chunks");
4470
4471        // A small file of the same language still produces chunks.
4472        let small = dir.path().join("small.ts");
4473        std::fs::write(&small, "export function foo() { return 1; }\n").unwrap();
4474        let small_chunks = collect_file_chunks(dir.path(), &small, &mut parsers).unwrap();
4475        assert!(!small_chunks.is_empty(), "small file should still chunk");
4476    }
4477
4478    #[test]
4479    fn rejects_oversized_dimension_during_deserialization() {
4480        let mut bytes = Vec::new();
4481        bytes.push(1u8);
4482        bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
4483        bytes.extend_from_slice(&0u32.to_le_bytes());
4484        bytes.extend_from_slice(&0u32.to_le_bytes());
4485
4486        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4487    }
4488
4489    #[test]
4490    fn rejects_oversized_entry_count_during_deserialization() {
4491        let mut bytes = Vec::new();
4492        bytes.push(1u8);
4493        bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
4494        bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
4495        bytes.extend_from_slice(&0u32.to_le_bytes());
4496
4497        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4498    }
4499
4500    #[test]
4501    fn invalidate_file_removes_entries_and_mtime() {
4502        let target = PathBuf::from("/src/main.rs");
4503        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4504        index.entries.push(EmbeddingEntry {
4505            chunk: SemanticChunk {
4506                file: target.clone(),
4507                name: "main".to_string(),
4508                kind: SymbolKind::Function,
4509                start_line: 0,
4510                end_line: 1,
4511                exported: false,
4512                embed_text: "main".to_string(),
4513                snippet: "fn main() {}".to_string(),
4514            },
4515            vector: vec![1.0; DEFAULT_DIMENSION],
4516        });
4517        index
4518            .file_mtimes
4519            .insert(target.clone(), SystemTime::UNIX_EPOCH);
4520        index.file_sizes.insert(target.clone(), 0);
4521
4522        index.invalidate_file(&target);
4523
4524        assert!(index.entries.is_empty());
4525        assert!(!index.file_mtimes.contains_key(&target));
4526        assert!(!index.file_sizes.contains_key(&target));
4527    }
4528
4529    #[test]
4530    fn refresh_missing_changed_file_is_purged_after_collect() {
4531        let temp = tempfile::tempdir().unwrap();
4532        let project_root = temp.path();
4533        let file = project_root.join("src/lib.rs");
4534        fs::create_dir_all(file.parent().unwrap()).unwrap();
4535        write_rust_file(&file, "vanished_symbol");
4536
4537        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4538        let original_size = *index.file_sizes.get(&file).unwrap();
4539        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
4540        fs::remove_file(&file).unwrap();
4541
4542        let mut embed = test_vector_for_texts;
4543        let mut progress = |_done: usize, _total: usize| {};
4544        let summary = index
4545            .refresh_stale_files(
4546                project_root,
4547                std::slice::from_ref(&file),
4548                &mut embed,
4549                8,
4550                &mut progress,
4551            )
4552            .unwrap();
4553
4554        assert_eq!(summary.changed, 0);
4555        assert_eq!(summary.added, 0);
4556        assert_eq!(summary.deleted, 1);
4557        assert!(index.entries.is_empty());
4558        assert!(!index.file_mtimes.contains_key(&file));
4559        assert!(!index.file_sizes.contains_key(&file));
4560        assert!(!index.file_hashes.contains_key(&file));
4561    }
4562
4563    #[test]
4564    fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
4565        let temp = tempfile::tempdir().unwrap();
4566        let project_root = temp.path();
4567        let file = project_root.join("src/lib.rs");
4568        fs::create_dir_all(file.parent().unwrap()).unwrap();
4569        write_rust_file(&file, "kept_symbol");
4570
4571        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4572        let original_entry_count = index.entries.len();
4573        let original_mtime = *index.file_mtimes.get(&file).unwrap();
4574        let original_size = *index.file_sizes.get(&file).unwrap();
4575
4576        let stale_mtime = SystemTime::UNIX_EPOCH;
4577        set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
4578        fs::remove_file(&file).unwrap();
4579        fs::create_dir(&file).unwrap();
4580
4581        let mut embed = test_vector_for_texts;
4582        let mut progress = |_done: usize, _total: usize| {};
4583        let summary = index
4584            .refresh_stale_files(
4585                project_root,
4586                std::slice::from_ref(&file),
4587                &mut embed,
4588                8,
4589                &mut progress,
4590            )
4591            .unwrap();
4592
4593        assert_eq!(summary.changed, 0);
4594        assert_eq!(summary.added, 0);
4595        assert_eq!(summary.deleted, 0);
4596        assert_eq!(index.entries.len(), original_entry_count);
4597        assert!(index
4598            .entries
4599            .iter()
4600            .any(|entry| entry.chunk.name == "kept_symbol"));
4601        assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
4602        assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
4603        assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
4604    }
4605
4606    #[test]
4607    fn refresh_never_indexed_file_error_does_not_record_mtime() {
4608        let temp = tempfile::tempdir().unwrap();
4609        let project_root = temp.path();
4610        let missing = project_root.join("src/missing.rs");
4611        fs::create_dir_all(missing.parent().unwrap()).unwrap();
4612
4613        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4614        let mut embed = test_vector_for_texts;
4615        let mut progress = |_done: usize, _total: usize| {};
4616        let summary = index
4617            .refresh_stale_files(
4618                project_root,
4619                std::slice::from_ref(&missing),
4620                &mut embed,
4621                8,
4622                &mut progress,
4623            )
4624            .unwrap();
4625
4626        assert_eq!(summary.added, 0);
4627        assert_eq!(summary.changed, 0);
4628        assert_eq!(summary.deleted, 0);
4629        assert!(!index.file_mtimes.contains_key(&missing));
4630        assert!(!index.file_sizes.contains_key(&missing));
4631        assert!(index.entries.is_empty());
4632    }
4633
4634    #[test]
4635    fn refresh_reports_added_for_new_files() {
4636        let temp = tempfile::tempdir().unwrap();
4637        let project_root = temp.path();
4638        let existing = project_root.join("src/lib.rs");
4639        let added = project_root.join("src/new.rs");
4640        fs::create_dir_all(existing.parent().unwrap()).unwrap();
4641        write_rust_file(&existing, "existing_symbol");
4642        write_rust_file(&added, "added_symbol");
4643
4644        let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
4645        let mut embed = test_vector_for_texts;
4646        let mut progress = |_done: usize, _total: usize| {};
4647        let summary = index
4648            .refresh_stale_files(
4649                project_root,
4650                &[existing.clone(), added.clone()],
4651                &mut embed,
4652                8,
4653                &mut progress,
4654            )
4655            .unwrap();
4656
4657        assert_eq!(summary.added, 1);
4658        assert_eq!(summary.changed, 0);
4659        assert_eq!(summary.deleted, 0);
4660        assert_eq!(summary.total_processed, 2);
4661        assert!(index.file_mtimes.contains_key(&added));
4662        assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
4663    }
4664
4665    #[test]
4666    fn refresh_reports_deleted_for_removed_files() {
4667        let temp = tempfile::tempdir().unwrap();
4668        let project_root = temp.path();
4669        let deleted = project_root.join("src/deleted.rs");
4670        fs::create_dir_all(deleted.parent().unwrap()).unwrap();
4671        write_rust_file(&deleted, "deleted_symbol");
4672
4673        let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
4674        fs::remove_file(&deleted).unwrap();
4675
4676        let mut embed = test_vector_for_texts;
4677        let mut progress = |_done: usize, _total: usize| {};
4678        let summary = index
4679            .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
4680            .unwrap();
4681
4682        assert_eq!(summary.deleted, 1);
4683        assert_eq!(summary.changed, 0);
4684        assert_eq!(summary.added, 0);
4685        assert_eq!(summary.total_processed, 1);
4686        assert!(!index.file_mtimes.contains_key(&deleted));
4687        assert!(index.entries.is_empty());
4688    }
4689
4690    #[test]
4691    fn refresh_reports_changed_for_modified_files() {
4692        let temp = tempfile::tempdir().unwrap();
4693        let project_root = temp.path();
4694        let file = project_root.join("src/lib.rs");
4695        fs::create_dir_all(file.parent().unwrap()).unwrap();
4696        write_rust_file(&file, "old_symbol");
4697
4698        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4699        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
4700        write_rust_file(&file, "new_symbol");
4701
4702        let mut embed = test_vector_for_texts;
4703        let mut progress = |_done: usize, _total: usize| {};
4704        let summary = index
4705            .refresh_stale_files(
4706                project_root,
4707                std::slice::from_ref(&file),
4708                &mut embed,
4709                8,
4710                &mut progress,
4711            )
4712            .unwrap();
4713
4714        assert_eq!(summary.changed, 1);
4715        assert_eq!(summary.added, 0);
4716        assert_eq!(summary.deleted, 0);
4717        assert_eq!(summary.total_processed, 1);
4718        assert!(index
4719            .entries
4720            .iter()
4721            .any(|entry| entry.chunk.name == "new_symbol"));
4722        assert!(!index
4723            .entries
4724            .iter()
4725            .any(|entry| entry.chunk.name == "old_symbol"));
4726    }
4727
4728    #[test]
4729    fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
4730        let temp = tempfile::tempdir().unwrap();
4731        let project_root = temp.path();
4732        let file = project_root.join("src/lib.rs");
4733        fs::create_dir_all(file.parent().unwrap()).unwrap();
4734        write_rust_file(&file, "clean_symbol");
4735
4736        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4737        let original_entries = index.entries.len();
4738        let mut embed_called = false;
4739        let mut embed = |texts: Vec<String>| {
4740            embed_called = true;
4741            test_vector_for_texts(texts)
4742        };
4743        let mut progress = |_done: usize, _total: usize| {};
4744        let summary = index
4745            .refresh_stale_files(
4746                project_root,
4747                std::slice::from_ref(&file),
4748                &mut embed,
4749                8,
4750                &mut progress,
4751            )
4752            .unwrap();
4753
4754        assert!(summary.is_noop());
4755        assert_eq!(summary.total_processed, 1);
4756        assert!(!embed_called);
4757        assert_eq!(index.entries.len(), original_entries);
4758    }
4759
4760    #[test]
4761    fn detects_missing_onnx_runtime_from_dynamic_load_error() {
4762        let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
4763
4764        assert!(is_onnx_runtime_unavailable(message));
4765    }
4766
4767    #[test]
4768    fn formats_missing_onnx_runtime_with_install_hint() {
4769        let message = format_embedding_init_error(
4770            "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
4771        );
4772
4773        assert!(message.starts_with("ONNX Runtime not found. Install via:"));
4774        assert!(message.contains("Original error:"));
4775    }
4776
4777    #[test]
4778    fn openai_compatible_backend_embeds_with_mock_server() {
4779        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
4780            assert!(request_line.starts_with("POST "));
4781            assert_eq!(path, "/v1/embeddings");
4782            "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
4783        });
4784
4785        let config = SemanticBackendConfig {
4786            backend: SemanticBackend::OpenAiCompatible,
4787            model: "test-embedding".to_string(),
4788            base_url: Some(base_url),
4789            api_key_env: None,
4790            timeout_ms: 5_000,
4791            max_batch_size: 64,
4792            max_files: 20_000,
4793        };
4794
4795        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
4796        let vectors = model
4797            .embed(vec!["hello".to_string(), "world".to_string()])
4798            .unwrap();
4799
4800        assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
4801        handle.join().unwrap();
4802    }
4803
4804    /// Regression for issue #36: AFT was sending TWO Content-Type headers
4805    /// on the OpenAI embeddings request — once implicitly via `.json(&body)`
4806    /// and again explicitly via `.header("Content-Type", "application/json")`.
4807    /// reqwest's `.header()` calls `HeaderMap::append`, which produces two
4808    /// headers on the wire. OpenAI's /v1/embeddings endpoint rejects that
4809    /// with `HTTP 400 "you must provide a model parameter"` even though the
4810    /// body actually contains `model`. The fix is to drop the explicit
4811    /// `.header("Content-Type", ...)` call. This test pins that we send
4812    /// exactly one Content-Type header.
4813    #[test]
4814    fn openai_compatible_request_has_single_content_type_header() {
4815        use std::sync::{Arc, Mutex};
4816        let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
4817        let captured_for_thread = Arc::clone(&captured);
4818
4819        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
4820        let addr = listener.local_addr().expect("local addr");
4821        let handle = thread::spawn(move || {
4822            let (mut stream, _) = listener.accept().expect("accept");
4823            let mut buf = Vec::new();
4824            let mut chunk = [0u8; 4096];
4825            let mut header_end = None;
4826            let mut content_length = 0usize;
4827            loop {
4828                let n = stream.read(&mut chunk).expect("read");
4829                if n == 0 {
4830                    break;
4831                }
4832                buf.extend_from_slice(&chunk[..n]);
4833                if header_end.is_none() {
4834                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
4835                        header_end = Some(pos + 4);
4836                        for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
4837                            if let Some(value) = line.strip_prefix("Content-Length:") {
4838                                content_length = value.trim().parse::<usize>().unwrap_or(0);
4839                            }
4840                        }
4841                    }
4842                }
4843                if let Some(end) = header_end {
4844                    if buf.len() >= end + content_length {
4845                        break;
4846                    }
4847                }
4848            }
4849            *captured_for_thread.lock().unwrap() = buf;
4850            let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
4851            let response = format!(
4852                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
4853                body.len(),
4854                body
4855            );
4856            let _ = stream.write_all(response.as_bytes());
4857        });
4858
4859        let config = SemanticBackendConfig {
4860            backend: SemanticBackend::OpenAiCompatible,
4861            model: "text-embedding-3-small".to_string(),
4862            base_url: Some(format!("http://{}", addr)),
4863            api_key_env: None,
4864            timeout_ms: 5_000,
4865            max_batch_size: 64,
4866            max_files: 20_000,
4867        };
4868        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
4869        let _ = model.embed(vec!["probe".to_string()]).unwrap();
4870        handle.join().unwrap();
4871
4872        let bytes = captured.lock().unwrap().clone();
4873        let request = String::from_utf8_lossy(&bytes);
4874
4875        // Lowercase line counts because HTTP headers are case-insensitive
4876        // and reqwest may emit `content-type` in lowercase under HTTP/2.
4877        let content_type_lines = request
4878            .lines()
4879            .filter(|line| {
4880                let lower = line.to_ascii_lowercase();
4881                lower.starts_with("content-type:")
4882            })
4883            .count();
4884        assert_eq!(
4885            content_type_lines, 1,
4886            "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
4887        );
4888
4889        // The body must still include the model field — pin this so a future
4890        // change can't accidentally drop `model` while fixing duplicate headers.
4891        assert!(
4892            request.contains(r#""model":"text-embedding-3-small""#),
4893            "request body should contain model field; full request:\n{request}",
4894        );
4895    }
4896
4897    #[test]
4898    fn ollama_backend_embeds_with_mock_server() {
4899        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
4900            assert!(request_line.starts_with("POST "));
4901            assert_eq!(path, "/api/embed");
4902            "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
4903        });
4904
4905        let config = SemanticBackendConfig {
4906            backend: SemanticBackend::Ollama,
4907            model: "embeddinggemma".to_string(),
4908            base_url: Some(base_url),
4909            api_key_env: None,
4910            timeout_ms: 5_000,
4911            max_batch_size: 64,
4912            max_files: 20_000,
4913        };
4914
4915        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
4916        let vectors = model
4917            .embed(vec!["hello".to_string(), "world".to_string()])
4918            .unwrap();
4919
4920        assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
4921        handle.join().unwrap();
4922    }
4923
4924    #[test]
4925    fn read_from_disk_rejects_fingerprint_mismatch() {
4926        let storage = tempfile::tempdir().unwrap();
4927        let project_key = "proj";
4928
4929        let project_root = test_project_root();
4930        let file = project_root.join("src/main.rs");
4931        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
4932        index.entries.push(EmbeddingEntry {
4933            chunk: SemanticChunk {
4934                file: file.clone(),
4935                name: "handle_request".to_string(),
4936                kind: SymbolKind::Function,
4937                start_line: 10,
4938                end_line: 25,
4939                exported: true,
4940                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
4941                snippet: "fn handle_request() {}".to_string(),
4942            },
4943            vector: vec![0.1, 0.2, 0.3],
4944        });
4945        index.dimension = 3;
4946        index
4947            .file_mtimes
4948            .insert(file.clone(), SystemTime::UNIX_EPOCH);
4949        index.file_sizes.insert(file, 0);
4950        index.set_fingerprint(SemanticIndexFingerprint {
4951            backend: "openai_compatible".to_string(),
4952            model: "test-embedding".to_string(),
4953            base_url: "http://127.0.0.1:1234/v1".to_string(),
4954            dimension: 3,
4955            chunking_version: default_chunking_version(),
4956        });
4957        index.write_to_disk(storage.path(), project_key);
4958
4959        let matching = index.fingerprint().unwrap().as_string();
4960        assert!(SemanticIndex::read_from_disk(
4961            storage.path(),
4962            project_key,
4963            &project_root,
4964            false,
4965            Some(&matching),
4966        )
4967        .is_some());
4968
4969        let mismatched = SemanticIndexFingerprint {
4970            backend: "ollama".to_string(),
4971            model: "embeddinggemma".to_string(),
4972            base_url: "http://127.0.0.1:11434".to_string(),
4973            dimension: 3,
4974            chunking_version: default_chunking_version(),
4975        }
4976        .as_string();
4977        assert!(SemanticIndex::read_from_disk(
4978            storage.path(),
4979            project_key,
4980            &project_root,
4981            false,
4982            Some(&mismatched),
4983        )
4984        .is_none());
4985    }
4986
4987    #[test]
4988    fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
4989        let storage = tempfile::tempdir().unwrap();
4990        let project_key = "proj-v3";
4991        let dir = storage.path().join("semantic").join(project_key);
4992        fs::create_dir_all(&dir).unwrap();
4993
4994        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4995        index.entries.push(EmbeddingEntry {
4996            chunk: SemanticChunk {
4997                file: PathBuf::from("/src/main.rs"),
4998                name: "handle_request".to_string(),
4999                kind: SymbolKind::Function,
5000                start_line: 0,
5001                end_line: 0,
5002                exported: true,
5003                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
5004                snippet: "fn handle_request() {}".to_string(),
5005            },
5006            vector: vec![0.1, 0.2, 0.3],
5007        });
5008        index.dimension = 3;
5009        index
5010            .file_mtimes
5011            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
5012        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
5013        let fingerprint = SemanticIndexFingerprint {
5014            backend: "fastembed".to_string(),
5015            model: "test".to_string(),
5016            base_url: FALLBACK_BACKEND.to_string(),
5017            dimension: 3,
5018            chunking_version: default_chunking_version(),
5019        };
5020        index.set_fingerprint(fingerprint.clone());
5021
5022        let mut bytes = index.to_bytes();
5023        bytes[0] = SEMANTIC_INDEX_VERSION_V3;
5024        fs::write(dir.join("semantic.bin"), bytes).unwrap();
5025
5026        assert!(SemanticIndex::read_from_disk(
5027            storage.path(),
5028            project_key,
5029            &test_project_root(),
5030            false,
5031            Some(&fingerprint.as_string())
5032        )
5033        .is_none());
5034        assert!(!dir.join("semantic.bin").exists());
5035    }
5036
5037    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
5038        crate::symbols::Symbol {
5039            name: name.to_string(),
5040            kind,
5041            range: crate::symbols::Range {
5042                start_line: start,
5043                start_col: 0,
5044                end_line: end,
5045                end_col: 0,
5046            },
5047            signature: None,
5048            scope_chain: Vec::new(),
5049            exported: false,
5050            parent: None,
5051        }
5052    }
5053
5054    /// Heading symbols (Markdown / HTML headings) must NOT be indexed —
5055    /// they overwhelmingly dominated semantic results even on code-shaped
5056    /// queries because heading prose embeds far more strongly than code
5057    /// chunks. Skipping headings keeps aft_search a code-finder.
5058    #[test]
5059    fn symbols_to_chunks_skips_heading_symbols() {
5060        let project_root = PathBuf::from("/proj");
5061        let file = project_root.join("README.md");
5062        let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
5063
5064        let symbols = vec![
5065            make_symbol(SymbolKind::Heading, "Title", 0, 2),
5066            make_symbol(SymbolKind::Heading, "Section", 4, 6),
5067        ];
5068
5069        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
5070        assert!(
5071            chunks.is_empty(),
5072            "Heading symbols must be filtered out before embedding; got {} chunk(s)",
5073            chunks.len()
5074        );
5075    }
5076
5077    /// A symbol with an enormous signature (e.g. a YAML/Kubernetes CronJob
5078    /// whose inline `command:` script is parsed into the signature) must not
5079    /// produce an embed_text that overflows the embedding backend's physical
5080    /// batch. Before the clamp, the unbounded `signature:` append created a
5081    /// multi-KB input that aborted the whole index build and degraded every
5082    /// search to lexical-only.
5083    #[test]
5084    fn build_embed_text_clamps_oversized_signature() {
5085        let project_root = PathBuf::from("/proj");
5086        let file = project_root.join("cronjob.yaml");
5087        let huge_sig = "kubectl ".repeat(2000); // ~16 KB
5088        let source = "apiVersion: batch/v1\nkind: CronJob\n";
5089
5090        let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
5091        symbol.signature = Some(huge_sig);
5092
5093        let text = build_embed_text(&symbol, source, &file, &project_root);
5094        assert!(
5095            text.chars().count() <= MAX_EMBED_TEXT_CHARS,
5096            "embed_text must be clamped to {} chars, got {}",
5097            MAX_EMBED_TEXT_CHARS,
5098            text.chars().count()
5099        );
5100    }
5101
5102    /// Code symbols (functions, classes, methods, structs, etc.) must still
5103    /// be indexed alongside the heading skip — otherwise we'd starve the
5104    /// index entirely.
5105    #[test]
5106    fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
5107        let project_root = PathBuf::from("/proj");
5108        let file = project_root.join("src/lib.rs");
5109        let source = "pub fn handle_request() -> bool {\n    true\n}\n";
5110
5111        let symbols = vec![
5112            // A heading mixed in (e.g. from a doc comment block elsewhere).
5113            make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
5114            make_symbol(SymbolKind::Function, "handle_request", 0, 2),
5115            make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
5116        ];
5117
5118        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
5119        assert_eq!(
5120            chunks.len(),
5121            3,
5122            "Expected file-summary + 2 code chunks (Function + Struct), got {}",
5123            chunks.len()
5124        );
5125        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
5126        assert!(chunks
5127            .iter()
5128            .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
5129        assert!(names.contains(&"handle_request"));
5130        assert!(names.contains(&"AuthService"));
5131        assert!(
5132            !names.contains(&"doc heading"),
5133            "Heading symbol leaked into chunks: {names:?}"
5134        );
5135    }
5136
5137    #[test]
5138    fn validate_ssrf_allows_loopback_hostnames() {
5139        // Loopback hostnames are explicitly allowed so self-hosted backends
5140        // (Ollama at http://localhost:11434) work at their default config.
5141        for host in &[
5142            "http://localhost",
5143            "http://localhost:8080",
5144            "http://localhost:11434", // Ollama default
5145            "http://localhost.localdomain",
5146            "http://foo.localhost",
5147        ] {
5148            assert!(
5149                validate_base_url_no_ssrf(host).is_ok(),
5150                "Expected {host} to be allowed (loopback), got: {:?}",
5151                validate_base_url_no_ssrf(host)
5152            );
5153        }
5154    }
5155
5156    #[test]
5157    fn validate_ssrf_allows_loopback_ips() {
5158        // 127.0.0.0/8 is loopback — by definition same-machine and not an
5159        // SSRF target. Allow it so Ollama at http://127.0.0.1:11434 works.
5160        for url in &[
5161            "http://127.0.0.1",
5162            "http://127.0.0.1:11434", // Ollama default
5163            "http://127.0.0.1:8080",
5164            "http://127.1.2.3",
5165        ] {
5166            let result = validate_base_url_no_ssrf(url);
5167            assert!(
5168                result.is_ok(),
5169                "Expected {url} to be allowed (loopback), got: {:?}",
5170                result
5171            );
5172        }
5173    }
5174
5175    #[test]
5176    fn validate_ssrf_rejects_private_non_loopback_ips() {
5177        // Non-loopback private/reserved IPs remain rejected — homelab/intranet
5178        // services on LAN IPs are real SSRF targets even though the user
5179        // configured them. Users who want this can opt in by binding the
5180        // service to a public-routable address.
5181        for url in &[
5182            "http://192.168.1.1",
5183            "http://10.0.0.1",
5184            "http://172.16.0.1",
5185            "http://169.254.169.254",
5186            "http://100.64.0.1",
5187        ] {
5188            let result = validate_base_url_no_ssrf(url);
5189            assert!(
5190                result.is_err(),
5191                "Expected {url} to be rejected (non-loopback private), got: {:?}",
5192                result
5193            );
5194        }
5195    }
5196
5197    #[test]
5198    fn validate_ssrf_rejects_mdns_local_hostnames() {
5199        // mDNS .local hostnames typically resolve to LAN devices, not
5200        // loopback. Rejecting them before DNS lookup gives a clearer error.
5201        for host in &[
5202            "http://printer.local",
5203            "http://nas.local:8080",
5204            "http://homelab.local",
5205        ] {
5206            let result = validate_base_url_no_ssrf(host);
5207            assert!(
5208                result.is_err(),
5209                "Expected {host} to be rejected (mDNS), got: {:?}",
5210                result
5211            );
5212        }
5213    }
5214
5215    #[test]
5216    fn normalize_base_url_allows_localhost_for_tests() {
5217        // normalize_base_url itself should NOT block localhost — only
5218        // validate_base_url_no_ssrf does. Tests construct backends directly.
5219        assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
5220        assert!(normalize_base_url("http://localhost:8080").is_ok());
5221    }
5222
5223    #[test]
5224    fn ssrf_guard_blocks_reserved_ranges_but_allows_loopback() {
5225        use std::net::IpAddr;
5226        let blocked = |s: &str| is_private_non_loopback_ip(&s.parse::<IpAddr>().unwrap());
5227
5228        // Private / link-local / CGNAT — blocked (unchanged behavior).
5229        assert!(blocked("10.0.0.1"));
5230        assert!(blocked("192.168.1.1"));
5231        assert!(blocked("169.254.0.1"));
5232        assert!(blocked("100.64.0.1"));
5233        // Newly covered by delegating to url_fetch's complete list:
5234        assert!(
5235            blocked("198.18.0.1"),
5236            "RFC2544 benchmark range must be blocked"
5237        );
5238        assert!(blocked("224.0.0.1"), "multicast must be blocked");
5239        assert!(blocked("fc00::1"), "IPv6 ULA must be blocked");
5240        assert!(blocked("fe80::1"), "IPv6 link-local must be blocked");
5241
5242        // Loopback — allowed (local Ollama endpoint), incl. IPv4-mapped form.
5243        assert!(!blocked("127.0.0.1"), "loopback must stay allowed");
5244        assert!(!blocked("::1"), "IPv6 loopback must stay allowed");
5245        assert!(
5246            !blocked("::ffff:127.0.0.1"),
5247            "IPv4-mapped loopback must stay allowed (matches prior carve-out)"
5248        );
5249
5250        // A public address must NOT be flagged.
5251        assert!(!blocked("8.8.8.8"));
5252    }
5253
5254    /// Pin the user-facing wording of the ONNX version-mismatch error.
5255    /// The auto-fix path MUST be listed first because it's the only safe
5256    /// option that doesn't require sudo or risk breaking other apps that
5257    /// link the system library. Regression of any of these strings would
5258    /// either mislead users (system rm before auto-fix) or break the
5259    /// `aft doctor --fix` discovery path.
5260    #[test]
5261    fn ort_mismatch_message_recommends_auto_fix_first() {
5262        let msg =
5263            format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
5264
5265        // The reported version and path must appear verbatim.
5266        assert!(
5267            msg.contains("v1.9.0"),
5268            "should report detected version: {msg}"
5269        );
5270        assert!(
5271            msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
5272            "should report system path: {msg}"
5273        );
5274        assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
5275
5276        // Solution ordering: auto-fix is #1, system rm is #2, install is #3.
5277        let auto_fix_pos = msg
5278            .find("Auto-fix")
5279            .expect("Auto-fix solution missing — users won't discover --fix");
5280        let remove_pos = msg
5281            .find("Remove the old library")
5282            .expect("system-rm solution missing");
5283        assert!(
5284            auto_fix_pos < remove_pos,
5285            "Auto-fix must come before manual rm — see PR comment thread"
5286        );
5287
5288        // The auto-fix command must be runnable as-is on a fresh system.
5289        assert!(
5290            msg.contains("npx @cortexkit/aft doctor --fix"),
5291            "auto-fix command must be present and copy-pasteable: {msg}"
5292        );
5293    }
5294
5295    #[cfg(any(target_os = "linux", target_os = "macos"))]
5296    #[test]
5297    fn loaded_ort_version_detection_prefers_actual_loaded_library_path() {
5298        let requested = "libonnxruntime.so";
5299        let actual = "/usr/local/lib/libonnxruntime.so.1.19.0";
5300
5301        assert_eq!(detect_ort_version_from_path(requested), None);
5302        let (version, source) =
5303            detect_ort_version_from_resolved_or_requested(Some(actual.to_string()), requested);
5304
5305        assert_eq!(version, Some("1.19.0".to_string()));
5306        assert_eq!(source, actual);
5307
5308        let msg = format_ort_version_mismatch(&version.unwrap(), &source);
5309        assert!(msg.contains("v1.19.0"));
5310        assert!(msg.contains(actual));
5311    }
5312
5313    /// macOS dylib paths must not produce a malformed message when the
5314    /// system path lacks a trailing slash. This is a regression guard
5315    /// for the "{}\n{}" format string contract.
5316    #[test]
5317    fn ort_mismatch_message_handles_macos_dylib_path() {
5318        let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
5319        assert!(msg.contains("v1.9.0"));
5320        assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
5321        // The dylib path must appear in the auto-fix paragraph (single
5322        // quotes around it) AND in the manual-rm paragraph; verify
5323        // both placements survived the format string.
5324        assert!(
5325            msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
5326            "system path should be quoted in the auto-fix sentence: {msg}"
5327        );
5328    }
5329}