Skip to main content

aft/
semantic_index.rs

1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::io::{self, BufReader, BufWriter, Cursor, Read, Write};
18use std::path::{Path, PathBuf};
19use std::sync::Mutex;
20use std::time::Duration;
21use std::time::SystemTime;
22use tree_sitter::Parser;
23use url::Url;
24
/// Default embedding dimension.
const DEFAULT_DIMENSION: usize = 384;
/// Upper bound on the number of index entries.
const MAX_ENTRIES: usize = 1_000_000;
// Covers high-dimensional backends such as OpenAI text-embedding-3-large (3072)
// and common local models (4096) while keeping a bounded supported shape.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of one `f32` embedding component.
const F32_BYTES: usize = std::mem::size_of::<f32>();
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
/// V3 adds subsec_nanos to the file-mtime table so staleness detection survives
/// restart round-trips on filesystems with subsecond mtime precision (APFS,
/// ext4 with nsec, NTFS). V1/V2 persisted whole-second mtimes only, which
/// caused every restart to flag ~99% of files as stale and re-embed them.
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
/// V4 keeps the V3 on-disk layout but rebuilds persisted snippets once after
/// fixing symbol ranges that were incorrectly treated as 1-based.
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
/// V5 adds file sizes to the file metadata table so incremental staleness
/// detection can catch content changes even when mtime precision misses them.
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
/// V6 stores paths relative to project_root and adds content hashes.
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended to an OpenAI-compatible base URL (after its `/v1` segment).
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended to an Ollama base URL that does not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
// Must stay below the bridge timeout (30s) to avoid bridge kills on slow backends.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the configuration leaves `max_batch_size` at zero.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Maximum number of query embeddings kept in the in-memory query cache.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored in the index fingerprint when no usable base URL exists.
const FALLBACK_BACKEND: &str = "none";
/// Total attempts (first try included) for one embedding HTTP request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay before retry N (0-based). One entry per retry, so the length is tied
/// to the attempt count and the two cannot drift apart silently.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; EMBEDDING_REQUEST_MAX_ATTEMPTS - 1] = [500, 1_000];
/// Serializes in-process attempts to take the on-disk semantic cache lock.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
63
64pub struct SemanticIndexLock {
65    _guard: fs_lock::LockGuard,
66}
67
68impl SemanticIndexLock {
69    pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
70        let dir = storage_dir.join("semantic").join(project_key);
71        fs::create_dir_all(&dir)?;
72        let path = dir.join("cache.lock");
73        let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
74            .lock()
75            .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
76        fs_lock::try_acquire(&path, Duration::from_secs(2))
77            .map(|guard| Self { _guard: guard })
78            .map_err(|error| match error {
79                fs_lock::AcquireError::Timeout => {
80                    std::io::Error::other("timed out acquiring semantic cache lock")
81                }
82                fs_lock::AcquireError::Io(error) => error,
83            })
84    }
85}
86
87#[derive(Debug, Clone, Serialize, Deserialize)]
88pub struct SemanticIndexFingerprint {
89    pub backend: String,
90    pub model: String,
91    #[serde(default)]
92    pub base_url: String,
93    pub dimension: usize,
94    #[serde(default = "default_chunking_version")]
95    pub chunking_version: u32,
96}
97
98fn default_chunking_version() -> u32 {
99    2
100}
101
102impl SemanticIndexFingerprint {
103    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
104        // Use normalized URL for fingerprinting so cosmetic differences
105        // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds.
106        let base_url = config
107            .base_url
108            .as_ref()
109            .and_then(|u| normalize_base_url(u).ok())
110            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
111        Self {
112            backend: config.backend.as_str().to_string(),
113            model: config.model.clone(),
114            base_url,
115            dimension,
116            chunking_version: default_chunking_version(),
117        }
118    }
119
120    pub fn as_string(&self) -> String {
121        serde_json::to_string(self).unwrap_or_else(|_| String::new())
122    }
123
124    fn matches_expected(&self, expected: &str) -> bool {
125        let encoded = self.as_string();
126        !encoded.is_empty() && encoded == expected
127    }
128}
129
130enum SemanticEmbeddingEngine {
131    /// Local ONNX embedder (all-MiniLM-L6-v2 via raw `ort`). The config-facing
132    /// backend string stays "fastembed" for index-fingerprint compatibility.
133    Local(LocalEmbedder),
134    OpenAiCompatible {
135        client: Client,
136        model: String,
137        base_url: String,
138        api_key: Option<String>,
139    },
140    Ollama {
141        client: Client,
142        model: String,
143        base_url: String,
144    },
145}
146
147pub struct SemanticEmbeddingModel {
148    backend: SemanticBackend,
149    model: String,
150    base_url: Option<String>,
151    timeout_ms: u64,
152    max_batch_size: usize,
153    dimension: Option<usize>,
154    engine: SemanticEmbeddingEngine,
155    query_embedding_cache: HashMap<String, Vec<f32>>,
156    query_embedding_cache_order: VecDeque<String>,
157    query_embedding_cache_hits: u64,
158    query_embedding_cache_misses: u64,
159}
160
161pub type EmbeddingModel = SemanticEmbeddingModel;
162
163fn validate_embedding_batch(
164    vectors: &[Vec<f32>],
165    expected_count: usize,
166    context: &str,
167) -> Result<(), String> {
168    if expected_count > 0 && vectors.is_empty() {
169        return Err(format!(
170            "{context} returned no vectors for {expected_count} inputs"
171        ));
172    }
173
174    if vectors.len() != expected_count {
175        return Err(format!(
176            "{context} returned {} vectors for {} inputs",
177            vectors.len(),
178            expected_count
179        ));
180    }
181
182    let Some(first_vector) = vectors.first() else {
183        return Ok(());
184    };
185    let expected_dimension = first_vector.len();
186    validate_embedding_dimension(expected_dimension)
187        .map_err(|error| format!("{context} returned {error}"))?;
188    for (index, vector) in vectors.iter().enumerate() {
189        if vector.len() != expected_dimension {
190            return Err(format!(
191                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
192                vector.len()
193            ));
194        }
195    }
196
197    Ok(())
198}
199
200fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
201    if dimension == 0 || dimension > MAX_DIMENSION {
202        return Err(format!(
203            "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
204        ));
205    }
206
207    Ok(())
208}
209
210/// Normalize a base URL: validate scheme and strip trailing slash.
211/// Does NOT perform SSRF/private-IP validation — call
212/// `validate_base_url_no_ssrf` separately when processing user-supplied config.
213fn normalize_base_url(raw: &str) -> Result<String, String> {
214    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
215    let scheme = parsed.scheme();
216    if scheme != "http" && scheme != "https" {
217        return Err(format!(
218            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
219            scheme
220        ));
221    }
222    Ok(parsed.to_string().trim_end_matches('/').to_string())
223}
224
225/// Validate that a base URL does not point to a private/loopback address.
226/// Call this on user-supplied config (at configure time) to prevent SSRF.
227/// Not called for programmatically constructed configs (e.g. tests).
228///
229/// **Loopback is allowed.** Self-hosted embedding backends (e.g. Ollama at
230/// `http://127.0.0.1:11434`) are a primary use case for `aft_search`. Loopback
231/// addresses by definition cannot be exploited as SSRF targets — they only
232/// reach services on the same machine. Allowing loopback unblocks Ollama at its
233/// default config without opening up SSRF to LAN/intranet services, which
234/// remain rejected.
235///
236/// **mDNS `.local` is rejected.** mDNS hostnames typically resolve to LAN
237/// devices (printers, homelab servers); rejecting them before DNS lookup keeps
238/// the SSRF guard meaningful for non-loopback private networks.
239pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
240    use std::net::{IpAddr, ToSocketAddrs};
241
242    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
243
244    let host = parsed.host_str().unwrap_or("");
245
246    // Loopback hostnames are explicitly allowed. RFC 6761 mandates that
247    // `localhost` and `*.localhost` resolve to loopback;
248    // `localhost.localdomain` is a historical alias used on some Linux
249    // distros. Self-hosted backends like Ollama use these by default.
250    let is_loopback_host =
251        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
252    if is_loopback_host {
253        return Ok(());
254    }
255
256    // mDNS hostnames are typically LAN devices, not loopback. Reject before
257    // DNS lookup so users get a clear error rather than a private-IP error.
258    if host.ends_with(".local") {
259        return Err(format!(
260            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
261        ));
262    }
263
264    // Resolve the hostname. Reject private/link-local/CGNAT IPs but NOT
265    // loopback (which is by definition same-machine and not an SSRF target).
266    let port = parsed.port_or_known_default().unwrap_or(443);
267    let addr_str = format!("{host}:{port}");
268    let addrs: Vec<IpAddr> = addr_str
269        .to_socket_addrs()
270        .map(|iter| iter.map(|sa| sa.ip()).collect())
271        .unwrap_or_default();
272    for ip in &addrs {
273        if is_private_non_loopback_ip(ip) {
274            return Err(format!(
275                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
276            ));
277        }
278    }
279
280    Ok(())
281}
282
283/// Returns true for IPv4/IPv6 addresses in private/link-local/CGNAT/benchmark/
284/// multicast/reserved ranges, EXCLUDING loopback (127.0.0.0/8 and ::1). Loopback
285/// is considered safe for SSRF purposes (same-machine, e.g. a local Ollama
286/// endpoint) — see [`validate_base_url_no_ssrf`] for rationale.
287///
288/// Delegates to [`crate::url_fetch::is_private_or_reserved_ip`] so there is one
289/// authoritative reserved-range list (the url_fetch copy is the maintained one;
290/// this used to be a drifting subset that missed e.g. 198.18.0.0/15 and the
291/// multicast/reserved blocks). We only re-add the loopback carve-out the
292/// url_fetch guard deliberately does not make.
293fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
294    // Canonicalize so an IPv4-mapped loopback (`::ffff:127.0.0.1`) is also
295    // recognized as loopback, matching the prior carve-out.
296    if ip.to_canonical().is_loopback() {
297        return false;
298    }
299    crate::url_fetch::is_private_or_reserved_ip(*ip)
300}
301
302fn build_openai_embeddings_endpoint(base_url: &str) -> String {
303    if base_url.ends_with("/v1") {
304        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
305    } else {
306        format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
307    }
308}
309
310fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
311    if base_url.ends_with("/api") {
312        format!("{base_url}/embed")
313    } else {
314        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
315    }
316}
317
/// Trim surrounding whitespace and treat a blank value as absent.
///
/// Returns `None` for `None`, for an empty string, and for a string made only
/// of whitespace; otherwise returns the trimmed text.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    value
        .map(|token| token.trim().to_string())
        .filter(|token| !token.is_empty())
}
328
329fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
330    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
331}
332
333/// Local backends (LM Studio, Ollama, llama.cpp) can return a 4xx — usually
334/// 400/409 — while a model is loading or was just unloaded. Only narrowly known
335/// local-backend loading/unloaded payloads are classified transient; generic
336/// 4xx bodies that merely mention phrases like "loading model" remain
337/// permanent so misconfigurations do not retry forever.
338fn embedding_response_body_is_transient(status: reqwest::StatusCode, raw: &str) -> bool {
339    if !matches!(
340        status,
341        reqwest::StatusCode::BAD_REQUEST
342            | reqwest::StatusCode::CONFLICT
343            | reqwest::StatusCode::REQUEST_TIMEOUT
344            | reqwest::StatusCode::LOCKED
345            | reqwest::StatusCode::TOO_EARLY
346    ) {
347        return false;
348    }
349
350    let lower = raw.to_ascii_lowercase();
351    let normalized = lower.trim();
352
353    normalized.contains("model was unloaded while the request was still in queue")
354        || normalized == "model is loading"
355        || normalized.starts_with("model is loading,")
356        || normalized.contains(r#""error":"model is loading"#)
357        || normalized.contains(r#""message":"model is loading"#)
358        || normalized == "model not loaded"
359        || normalized.contains(r#""error":"model not loaded""#)
360        || normalized.contains(r#""message":"model not loaded""#)
361        || normalized == "loading model into memory"
362        || normalized.contains(r#""error":"loading model into memory""#)
363        || normalized.contains(r#""message":"loading model into memory""#)
364        || normalized == "model is being loaded"
365        || normalized.contains(r#""error":"model is being loaded""#)
366        || normalized.contains(r#""message":"model is being loaded""#)
367        || normalized == "model is currently loading"
368        || normalized.contains(r#""error":"model is currently loading""#)
369        || normalized.contains(r#""message":"model is currently loading""#)
370}
371
372fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
373    error.is_connect()
374}
375
376/// Whether a send-time error means the backend is *unreachable or temporarily
377/// failing* (vs. a real misconfiguration). Broader than the in-request retry
378/// predicate: a per-request timeout is transient for the build/refresh layer
379/// (the model may still be cold-loading) but we don't burn the 3 fast
380/// in-request attempts on it — the build-level retry rides it out instead.
381fn embedding_send_error_is_transient(error: &reqwest::Error) -> bool {
382    error.is_connect() || error.is_timeout()
383}
384
385fn embedding_response_read_error_is_transient(error: &reqwest::Error) -> bool {
386    embedding_send_error_is_transient(error) || error.is_body() || error.is_decode()
387}
388
/// Stable machine marker prefixed onto embedding error strings whose root cause
/// is transient — the backend is down, timing out, or returning 5xx/429, not
/// misconfigured. The build and corpus-refresh layers key retry-vs-give-up on
/// this marker (see [`embedding_failure_is_transient`]) instead of re-parsing
/// error text, so transience stays authoritative at the one site that knows it.
/// Stripped before any user-facing display via [`strip_transient_embedding_marker`].
///
/// Note the trailing space: it is part of the marker.
pub const TRANSIENT_EMBEDDING_MARKER: &str = "[transient] ";

/// True when an embedding error carries the transient marker — i.e. retrying
/// once the backend recovers is the right move, not surfacing a hard failure.
///
/// The marker is matched anywhere in the text, so it is still found after an
/// outer layer has prefixed the message with its own context.
pub fn embedding_failure_is_transient(error: &str) -> bool {
    error.contains(TRANSIENT_EMBEDDING_MARKER)
}

/// Remove the machine transient marker so the message is clean for display.
///
/// Every occurrence is removed; text without the marker is returned unchanged.
pub fn strip_transient_embedding_marker(error: &str) -> String {
    error.replace(TRANSIENT_EMBEDDING_MARKER, "")
}
407
408fn sleep_before_embedding_retry(attempt_index: usize) {
409    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
410        std::thread::sleep(Duration::from_millis(*delay_ms));
411    }
412}
413
414fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
415where
416    F: FnMut() -> reqwest::blocking::RequestBuilder,
417{
418    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
419        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
420
421        let response = match make_request().send() {
422            Ok(response) => response,
423            Err(error) => {
424                if !last_attempt && is_retryable_embedding_error(&error) {
425                    sleep_before_embedding_retry(attempt_index);
426                    continue;
427                }
428                // Connect/timeout failures mean the backend is unreachable or
429                // cold-loading — mark transient so the build layer rides it out
430                // and self-heals instead of parking the index in `Failed`.
431                let marker = if embedding_send_error_is_transient(&error) {
432                    TRANSIENT_EMBEDDING_MARKER
433                } else {
434                    ""
435                };
436                return Err(format!("{marker}{backend_label} request failed: {error}"));
437            }
438        };
439
440        let status = response.status();
441        let raw = match response.text() {
442            Ok(raw) => raw,
443            Err(error) => {
444                if !last_attempt && embedding_response_read_error_is_transient(&error) {
445                    sleep_before_embedding_retry(attempt_index);
446                    continue;
447                }
448                let marker = if embedding_response_read_error_is_transient(&error) {
449                    TRANSIENT_EMBEDDING_MARKER
450                } else {
451                    ""
452                };
453                return Err(format!(
454                    "{marker}{backend_label} response read failed: {error}"
455                ));
456            }
457        };
458
459        if status.is_success() {
460            return Ok(raw);
461        }
462
463        // A 4xx whose body says the model is loading/unloaded is transient on
464        // local backends (LM Studio/Ollama), so treat it like a retryable
465        // status: ride it out at both the in-request and build-retry layers.
466        let body_transient = embedding_response_body_is_transient(status, &raw);
467        if !last_attempt && (is_retryable_embedding_status(status) || body_transient) {
468            sleep_before_embedding_retry(attempt_index);
469            continue;
470        }
471
472        // 5xx / 429 are server-side and transient — the backend is overloaded
473        // or briefly unavailable, not misconfigured. A 4xx whose body indicates
474        // the model is (un)loading is also transient (local backend mid-swap).
475        // Other 4xx (auth, bad request, model-not-found) is a real error the
476        // user must fix; no marker.
477        let marker = if is_retryable_embedding_status(status) || body_transient {
478            TRANSIENT_EMBEDDING_MARKER
479        } else {
480            ""
481        };
482        return Err(format!(
483            "{marker}{backend_label} request failed (HTTP {}): {}",
484            status, raw
485        ));
486    }
487
488    unreachable!("embedding request retries exhausted without returning")
489}
490
491impl SemanticEmbeddingModel {
492    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
493        let timeout_ms = if config.timeout_ms == 0 {
494            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
495        } else {
496            config.timeout_ms
497        };
498
499        let max_batch_size = if config.max_batch_size == 0 {
500            DEFAULT_MAX_BATCH_SIZE
501        } else {
502            config.max_batch_size
503        };
504
505        let api_key_env = normalize_api_key(config.api_key_env.clone());
506        let model = config.model.clone();
507
508        let client = Client::builder()
509            .timeout(Duration::from_millis(timeout_ms))
510            .redirect(reqwest::redirect::Policy::none())
511            .build()
512            .map_err(|error| format!("failed to configure embedding client: {error}"))?;
513
514        let engine = match config.backend {
515            SemanticBackend::Fastembed => {
516                SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
517            }
518            SemanticBackend::OpenAiCompatible => {
519                let raw = config.base_url.as_ref().ok_or_else(|| {
520                    "base_url is required for openai_compatible backend".to_string()
521                })?;
522                let base_url = normalize_base_url(raw)?;
523
524                let api_key = match api_key_env {
525                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
526                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
527                    })?),
528                    None => None,
529                };
530
531                SemanticEmbeddingEngine::OpenAiCompatible {
532                    client,
533                    model,
534                    base_url,
535                    api_key,
536                }
537            }
538            SemanticBackend::Ollama => {
539                let raw = config
540                    .base_url
541                    .as_ref()
542                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
543                let base_url = normalize_base_url(raw)?;
544
545                SemanticEmbeddingEngine::Ollama {
546                    client,
547                    model,
548                    base_url,
549                }
550            }
551        };
552
553        Ok(Self {
554            backend: config.backend,
555            model: config.model.clone(),
556            base_url: config.base_url.clone(),
557            timeout_ms,
558            max_batch_size,
559            dimension: None,
560            engine,
561            query_embedding_cache: HashMap::new(),
562            query_embedding_cache_order: VecDeque::new(),
563            query_embedding_cache_hits: 0,
564            query_embedding_cache_misses: 0,
565        })
566    }
567
568    pub fn backend(&self) -> SemanticBackend {
569        self.backend
570    }
571
572    pub fn model(&self) -> &str {
573        &self.model
574    }
575
576    pub fn base_url(&self) -> Option<&str> {
577        self.base_url.as_deref()
578    }
579
580    pub fn max_batch_size(&self) -> usize {
581        self.max_batch_size
582    }
583
584    pub fn timeout_ms(&self) -> u64 {
585        self.timeout_ms
586    }
587
588    pub fn fingerprint(
589        &mut self,
590        config: &SemanticBackendConfig,
591    ) -> Result<SemanticIndexFingerprint, String> {
592        let dimension = self.dimension()?;
593        Ok(SemanticIndexFingerprint::from_config(config, dimension))
594    }
595
596    pub fn dimension(&mut self) -> Result<usize, String> {
597        if let Some(dimension) = self.dimension {
598            return Ok(dimension);
599        }
600
601        let dimension = match &mut self.engine {
602            SemanticEmbeddingEngine::Local(model) => {
603                let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
604                vectors
605                    .first()
606                    .map(|v| v.len())
607                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
608            }
609            SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
610                let vectors =
611                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
612                vectors
613                    .first()
614                    .map(|v| v.len())
615                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
616            }
617            SemanticEmbeddingEngine::Ollama { .. } => {
618                let vectors =
619                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
620                vectors
621                    .first()
622                    .map(|v| v.len())
623                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
624            }
625        };
626
627        self.dimension = Some(dimension);
628        Ok(dimension)
629    }
630
631    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
632        self.embed_texts(texts)
633    }
634
635    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
636        if let Some(vector) = self.query_embedding_cache.get(query) {
637            self.query_embedding_cache_hits += 1;
638            return Ok(vector.clone());
639        }
640
641        self.query_embedding_cache_misses += 1;
642        let embeddings = self.embed_texts(vec![query.to_string()])?;
643        let vector = embeddings
644            .first()
645            .cloned()
646            .ok_or_else(|| "embedding model returned no query vector".to_string())?;
647
648        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
649            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
650                self.query_embedding_cache.remove(&oldest);
651            }
652        }
653        self.query_embedding_cache
654            .insert(query.to_string(), vector.clone());
655        self.query_embedding_cache_order
656            .push_back(query.to_string());
657
658        Ok(vector)
659    }
660
661    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
662        (
663            self.query_embedding_cache_hits,
664            self.query_embedding_cache_misses,
665            self.query_embedding_cache.len(),
666        )
667    }
668
669    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
670        match &mut self.engine {
671            SemanticEmbeddingEngine::Local(model) => model
672                .embed(&texts)
673                .map_err(|error| format!("failed to embed batch: {error}")),
674            SemanticEmbeddingEngine::OpenAiCompatible {
675                client,
676                model,
677                base_url,
678                api_key,
679            } => {
680                let expected_text_count = texts.len();
681                let endpoint = build_openai_embeddings_endpoint(base_url);
682                let body = serde_json::json!({
683                    "input": texts,
684                    "model": model,
685                });
686
687                let raw = send_embedding_request(
688                    || {
689                        // `.json(&body)` sets Content-Type: application/json
690                        // automatically. Do NOT add `.header("Content-Type",
691                        // "application/json")` afterwards — RequestBuilder::header()
692                        // calls HeaderMap::append, which produces TWO Content-Type
693                        // headers on the wire. OpenAI's /v1/embeddings endpoint
694                        // treats duplicate Content-Type as malformed and rejects
695                        // the body with 400 "you must provide a model parameter"
696                        // even when `model` is set. Verified end-to-end against
697                        // api.openai.com. See issue #36.
698                        let mut request = client.post(&endpoint).json(&body);
699
700                        if let Some(api_key) = api_key {
701                            request = request.header("Authorization", format!("Bearer {api_key}"));
702                        }
703
704                        request
705                    },
706                    "openai compatible",
707                )?;
708
709                #[derive(Deserialize)]
710                struct OpenAiResponse {
711                    data: Vec<OpenAiEmbeddingResult>,
712                }
713
714                #[derive(Deserialize)]
715                struct OpenAiEmbeddingResult {
716                    embedding: Vec<f32>,
717                    index: Option<u32>,
718                }
719
720                let parsed: OpenAiResponse = serde_json::from_str(&raw)
721                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
722                if parsed.data.len() != expected_text_count {
723                    return Err(format!(
724                        "openai compatible response returned {} embeddings for {} inputs",
725                        parsed.data.len(),
726                        expected_text_count
727                    ));
728                }
729
730                let mut vectors = vec![Vec::new(); parsed.data.len()];
731                for (i, item) in parsed.data.into_iter().enumerate() {
732                    let index = item.index.unwrap_or(i as u32) as usize;
733                    if index >= vectors.len() {
734                        return Err(
735                            "openai compatible response contains invalid vector index".to_string()
736                        );
737                    }
738                    vectors[index] = item.embedding;
739                }
740
741                for vector in &vectors {
742                    if vector.is_empty() {
743                        return Err(
744                            "openai compatible response contained missing vectors".to_string()
745                        );
746                    }
747                }
748
749                self.dimension = vectors.first().map(Vec::len);
750                Ok(vectors)
751            }
752            SemanticEmbeddingEngine::Ollama {
753                client,
754                model,
755                base_url,
756            } => {
757                let expected_text_count = texts.len();
758                let endpoint = build_ollama_embeddings_endpoint(base_url);
759
760                #[derive(Serialize)]
761                struct OllamaPayload<'a> {
762                    model: &'a str,
763                    input: Vec<String>,
764                }
765
766                let payload = OllamaPayload {
767                    model,
768                    input: texts,
769                };
770
771                let raw = send_embedding_request(
772                    || {
773                        // `.json(&payload)` sets Content-Type automatically.
774                        // Same duplicate-header trap as the OpenAI branch above
775                        // — most Ollama servers tolerate it, but the
776                        // single-Content-Type form is the correct one.
777                        client.post(&endpoint).json(&payload)
778                    },
779                    "ollama",
780                )?;
781
782                #[derive(Deserialize)]
783                struct OllamaResponse {
784                    embeddings: Vec<Vec<f32>>,
785                }
786
787                let parsed: OllamaResponse = serde_json::from_str(&raw)
788                    .map_err(|error| format!("invalid ollama response: {error}"))?;
789                if parsed.embeddings.is_empty() {
790                    return Err("ollama response returned no embeddings".to_string());
791                }
792                if parsed.embeddings.len() != expected_text_count {
793                    return Err(format!(
794                        "ollama response returned {} embeddings for {} inputs",
795                        parsed.embeddings.len(),
796                        expected_text_count
797                    ));
798                }
799
800                let vectors = parsed.embeddings;
801                for vector in &vectors {
802                    if vector.is_empty() {
803                        return Err("ollama response contained empty embeddings".to_string());
804                    }
805                }
806
807                self.dimension = vectors.first().map(Vec::len);
808                Ok(vectors)
809            }
810        }
811    }
812}
813
/// Pre-validate ONNX Runtime by loading the shared library ourselves before ort touches it.
/// This catches missing/broken/incompatible libraries without risking a panic in the ort crate.
///
/// The library to probe is taken from the `ORT_DYLIB_PATH` environment variable when set,
/// otherwise from the platform default name (`libonnxruntime.so`, `libonnxruntime.dylib`
/// or `onnxruntime.dll`).
///
/// * Linux/macOS: the library is opened with `dlopen(RTLD_NOW)`. The version is then
///   derived from the resolved library path (see `detect_ort_version_from_loaded_library`);
///   `OrtGetApiBase` is only used by that helper chain to locate the loaded file.
/// * Windows: the DLL is opened with `LoadLibraryExW` and the version is read from the
///   fixed file-version resource of the module that was actually loaded.
/// * Other targets: no check is performed and `Ok(())` is returned.
///
/// # Errors
///
/// Returns an `"ONNX Runtime not found. ..."` message when the library cannot be loaded,
/// and the `format_ort_version_mismatch` message when a version is detected whose major
/// is not 1 or whose minor is below 20. When no version can be detected (or parsed) the
/// library is accepted.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        // SAFETY: `c_name` is a NUL-terminated CString that outlives the dlopen
        // call; `handle` is null-checked before use and closed exactly once; the
        // dlerror pointer is null-checked before being read as a C string.
        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Try to detect the runtime version from the actual loaded library
            // path first. A bare dlopen("libonnxruntime.so") may resolve to an
            // older system ORT through loader search paths; checking only the
            // caller-supplied soname would miss that and let ort fail opaquely.
            let (detected_version, version_source) =
                detect_ort_version_from_loaded_library(handle, lib_name);

            // The handle is only needed for path resolution above; release it
            // before the version check so every return path below is leak-free.
            libc::dlclose(handle);

            // Check version compatibility — we need 1.20+.
            // An undetected or unparsable version is deliberately let through.
            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, &version_source));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        // Validate ONNX Runtime availability on Windows by loading the DLL
        // via LoadLibraryExW before the ort crate attempts its own LoadLibrary.
        // This way we can produce a friendly error (with installation hints)
        // instead of a raw LoadLibrary failure from deep inside fastembed.
        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");

        // Use kernel32 LoadLibraryExW for the validation — built-in, no
        // crate dependency required. GetModuleFileNameW resolves the loaded
        // DLL path for version probing via the version.dll API.
        #[link(name = "kernel32")]
        extern "system" {
            fn LoadLibraryExW(
                lpLibFileName: *const u16,
                hFile: *mut std::ffi::c_void,
                dwFlags: u32,
            ) -> *mut std::ffi::c_void;
            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
            fn GetModuleFileNameW(
                hModule: *mut std::ffi::c_void,
                lpFilename: *mut u16,
                nSize: u32,
            ) -> u32;
        }

        #[link(name = "version")]
        extern "system" {
            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
            fn GetFileVersionInfoW(
                lptstrFilename: *const u16,
                dwHandle: u32,
                dwLen: u32,
                lpData: *mut std::ffi::c_void,
            ) -> i32;
            fn VerQueryValueW(
                pBlock: *mut std::ffi::c_void,
                lpSubBlock: *const u16,
                lplpBuffer: *mut *mut std::ffi::c_void,
                puLen: *mut u32,
            ) -> i32;
        }

        // Local declaration of the fixed file-version record returned by
        // VerQueryValueW for the root ("\") sub-block; only
        // `dw_file_version_ms` is read below.
        #[repr(C)]
        struct VS_FIXEDFILEINFO {
            dw_signature: u32,
            dw_struc_version: u32,
            dw_file_version_ms: u32, // HIWORD major, LOWORD minor
            dw_file_version_ls: u32, // HIWORD build, LOWORD revision
            dw_product_version_ms: u32,
            dw_product_version_ls: u32,
            dw_file_flags_mask: u32,
            dw_file_flags: u32,
            dw_file_os: u32,
            dw_file_type: u32,
            dw_file_subtype: u32,
            dw_file_date_ms: u32,
            dw_file_date_ls: u32,
        }

        // SAFETY: every wide string passed to the Win32 calls is NUL-terminated
        // (`wide` via the chained 0, `path_buf` via zero-initialisation, and
        // `sub_block` via the embedded "\0"); buffers are passed with their real
        // sizes; `handle` and `vs_info` are null-checked before use; the module
        // is freed exactly once.
        unsafe {
            use std::os::windows::ffi::OsStrExt;
            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
                .encode_wide()
                .chain(std::iter::once(0))
                .collect();

            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
            if handle.is_null() {
                let err = std::io::Error::last_os_error();
                return Err(format!(
                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, err
                ));
            }

            // Probe the file version from PE resources so we can reject
            // outdated DLLs (e.g. v1.9.x) before the ort crate panics.
            // A value of 0 for `detected_major` doubles as "not detected".
            let mut detected_major: u32 = 0;
            let mut detected_minor: u32 = 0;
            // Use MAX_UNICODEPATH (32767) so deeply nested ORT paths (e.g.
            // long NuGet package paths under %USERPROFILE%) never truncate.
            // GetModuleFileNameW truncates silently when the buffer is too
            // small, which causes version probing to fail and the version
            // check to be bypassed — better to allocate generously.
            let mut path_buf = [0u16; 32767];
            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
            if path_len > 0 {
                let mut dummy_handle: u32 = 0;
                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
                if info_size > 0 {
                    let mut info = vec![0u8; info_size as usize];
                    if GetFileVersionInfoW(
                        path_buf.as_ptr(),
                        0,
                        info_size,
                        info.as_mut_ptr() as *mut std::ffi::c_void,
                    ) != 0
                    {
                        // "\" (NUL-terminated) selects the root block.
                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
                        let mut vs_len: u32 = 0;
                        if VerQueryValueW(
                            info.as_mut_ptr() as *mut std::ffi::c_void,
                            sub_block.as_ptr(),
                            &mut vs_info,
                            &mut vs_len,
                        ) != 0
                            && !vs_info.is_null()
                        {
                            // `vs_info` points into `info`, which is still alive here.
                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
                            detected_major = (*fixed).dw_file_version_ms >> 16;
                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
                        }
                    }
                }
            }

            FreeLibrary(handle);

            // Version compatibility check (mirrors the Linux/macOS path).
            // If version could not be detected (detected_major == 0) we let
            // the load succeed — the ort crate will diagnose further.
            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
                let ver = format!("{}.{}", detected_major, detected_minor);
                return Err(format_ort_version_mismatch(&ver, lib_name));
            }
        }
    }

    Ok(())
}
1004
1005#[cfg(any(target_os = "linux", target_os = "macos"))]
1006unsafe fn loaded_library_path_from_handle(handle: *mut std::ffi::c_void) -> Option<String> {
1007    let symbol_name = std::ffi::CString::new("OrtGetApiBase").ok()?;
1008    let symbol = unsafe { libc::dlsym(handle, symbol_name.as_ptr()) };
1009    if symbol.is_null() {
1010        return None;
1011    }
1012
1013    let mut info = std::mem::MaybeUninit::<libc::Dl_info>::uninit();
1014    if unsafe { libc::dladdr(symbol, info.as_mut_ptr()) } == 0 {
1015        return None;
1016    }
1017
1018    let info = unsafe { info.assume_init() };
1019    if info.dli_fname.is_null() {
1020        return None;
1021    }
1022
1023    Some(
1024        unsafe { std::ffi::CStr::from_ptr(info.dli_fname) }
1025            .to_string_lossy()
1026            .into_owned(),
1027    )
1028}
1029
1030#[cfg(any(target_os = "linux", target_os = "macos"))]
1031fn detect_ort_version_from_resolved_or_requested(
1032    resolved_path: Option<String>,
1033    requested_lib_name: &str,
1034) -> (Option<String>, String) {
1035    if let Some(path) = resolved_path {
1036        if let Some(version) = detect_ort_version_from_path(&path) {
1037            return (Some(version), path);
1038        }
1039        return (detect_ort_version_from_path(requested_lib_name), path);
1040    }
1041
1042    (
1043        detect_ort_version_from_path(requested_lib_name),
1044        requested_lib_name.to_string(),
1045    )
1046}
1047
/// Detect the ORT version for an already-loaded library.
///
/// Resolves the loaded file's path from `handle` and delegates to
/// `detect_ort_version_from_resolved_or_requested`, which falls back to
/// `requested_lib_name` when the path cannot be resolved. Returns the
/// detected version (if any) together with the path it is reported against.
///
/// NOTE(review): this is a safe fn that forwards `handle` to an `unsafe fn`;
/// callers must pass a handle that is still open — confirm at each call site.
#[cfg(any(target_os = "linux", target_os = "macos"))]
fn detect_ort_version_from_loaded_library(
    handle: *mut std::ffi::c_void,
    requested_lib_name: &str,
) -> (Option<String>, String) {
    detect_ort_version_from_resolved_or_requested(
        // Relies on the caller having passed a live dlopen handle.
        unsafe { loaded_library_path_from_handle(handle) },
        requested_lib_name,
    )
}
1058
1059/// Try to extract the ORT version from the library filename or resolved symlink.
1060/// Examples: "libonnxruntime.so.1.19.0" → "1.19.0", "libonnxruntime.1.24.4.dylib" → "1.24.4"
1061#[cfg(any(target_os = "linux", target_os = "macos"))]
1062fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
1063    let path = std::path::Path::new(lib_path);
1064
1065    // Try the path as given, then follow symlinks
1066    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
1067        .into_iter()
1068        .flatten()
1069    {
1070        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
1071            if let Some(version) = extract_version_from_filename(name) {
1072                return Some(version);
1073            }
1074        }
1075    }
1076
1077    // Also check for versioned siblings in the same directory
1078    if let Some(parent) = path.parent() {
1079        if let Ok(entries) = std::fs::read_dir(parent) {
1080            for entry in entries.flatten() {
1081                if let Some(name) = entry.file_name().to_str() {
1082                    if name.starts_with("libonnxruntime") {
1083                        if let Some(version) = extract_version_from_filename(name) {
1084                            return Some(version);
1085                        }
1086                    }
1087                }
1088            }
1089        }
1090    }
1091
1092    None
1093}
1094
1095/// Extract version from filenames like "libonnxruntime.so.1.19.0" or "libonnxruntime.1.24.4.dylib"
1096#[cfg(any(target_os = "linux", target_os = "macos"))]
1097fn extract_version_from_filename(name: &str) -> Option<String> {
1098    // Match patterns: .so.X.Y.Z or .X.Y.Z.dylib or .X.Y.Z.so
1099    let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
1100    re.find(name).map(|m| m.as_str().to_string())
1101}
1102
/// Suggest a shell command that removes the outdated ONNX Runtime library.
///
/// Libraries under `/usr/local/lib`, or referenced only by their bare default
/// name, get the platform's system-wide cleanup command on Linux and macOS.
/// Everything else (and every path on other targets) gets a plain `rm` of the
/// given path. The result is indented by three spaces.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_wide = matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib")
        || lib_path.starts_with("/usr/local/lib");

    if is_system_wide {
        #[cfg(target_os = "linux")]
        return String::from("   sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig");
        #[cfg(target_os = "macos")]
        return String::from("   sudo rm /usr/local/lib/libonnxruntime*");
    }

    format!("   rm '{lib_path}'")
}
1115
1116/// Build the user-facing error message for an incompatible ONNX Runtime
1117/// install. Extracted as a pure helper so we can unit-test the wording
1118/// stability — the auto-fix recommendation must always come first because
1119/// it's the only safe option, and the system-rm step must remain present
1120/// because some users prefer the system-wide cleanup path.
1121pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
1122    format!(
1123        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
1124         Solutions:\n\
1125         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
1126         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
1127         configures the bridge to load it instead of the system library — no \
1128         changes to '{}'.\n\
1129         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
1130         {}\n\
1131         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
1132         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
1133        version,
1134        lib_name,
1135        lib_name,
1136        suggest_removal_command(lib_name),
1137    )
1138}
1139
/// Decide whether an error message means "ONNX Runtime could not be loaded".
///
/// Returns `true` when the message (ignoring leading whitespace) starts with
/// the exact prefix `ONNX Runtime not found.`, or when — compared
/// case-insensitively — it both names ONNX Runtime and contains a phrase
/// typical of a dynamic-library load failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_MARKERS: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_MARKERS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    // Messages produced by our own pre-validation carry this exact prefix.
    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    mentions_any(&RUNTIME_MARKERS) && mentions_any(&LOAD_FAILURE_MARKERS)
}
1165
1166pub fn format_embedding_init_error(error: impl Display) -> String {
1167    let message = error.to_string();
1168
1169    if is_onnx_runtime_unavailable(&message) {
1170        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1171    }
1172
1173    format!("failed to initialize semantic embedding model: {message}")
1174}
1175
/// A chunk of code ready for embedding — derived from a Symbol with context enrichment
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// Absolute file path
    pub file: PathBuf,
    /// Symbol name
    pub name: String,
    /// Symbol kind (function, class, struct, etc.)
    pub kind: SymbolKind,
    /// First line of the range (0-based internally, inclusive)
    pub start_line: u32,
    /// Last line of the range (0-based internally, inclusive)
    pub end_line: u32,
    /// Whether the symbol is exported
    pub exported: bool,
    /// The enriched text that gets embedded (scope + signature + body snippet).
    /// Its exact bytes are also what refresh-time vector reuse is keyed on.
    pub embed_text: String,
    /// Short code snippet for display in results
    pub snippet: String,
}
1195
/// A stored embedding entry — chunk metadata + vector
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    /// The chunk this entry describes (location, display snippet, embedded text).
    chunk: SemanticChunk,
    /// The embedding the backend returned for `chunk.embed_text`.
    vector: Vec<f32>,
}
1202
/// The semantic index — stores embeddings for all symbols in a project
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// One entry per embedded chunk.
    entries: Vec<EmbeddingEntry>,
    /// Track which files are indexed and their mtime for staleness detection
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Track indexed file sizes alongside mtimes for staleness detection
    file_sizes: HashMap<PathBuf, u64>,
    /// blake3 content hash recorded for each indexed file
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Embedding dimension (384 for MiniLM-L6-v2)
    dimension: usize,
    /// Optional fingerprint of the index; `None` on freshly constructed indexes.
    /// NOTE(review): what the fingerprint encodes is defined elsewhere — confirm there.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Root directory of the project; constructors debug-assert that it is absolute.
    project_root: PathBuf,
    /// Starts empty on construction.
    /// NOTE(review): presumably files whose indexing was postponed — verify against users.
    deferred_files: HashSet<PathBuf>,
}
1218
/// Per-file metadata gathered while collecting chunks; it is later split into
/// the index's mtime, size and content-hash tables.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time of the file.
    mtime: SystemTime,
    /// Size of the file (stored in the index's `file_sizes` table).
    size: u64,
    /// blake3 hash of the file (stored in the index's `file_hashes` table).
    content_hash: blake3::Hash,
}
1225
/// Outcome of an incremental refresh of the semantic index. All counters are
/// file counts; `total_processed` is the number of current/deleted files
/// that were considered.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Files counted as changed.
    pub changed: usize,
    /// Files counted as added.
    pub added: usize,
    /// Files counted as deleted.
    pub deleted: usize,
    /// Current/deleted files considered by the refresh.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// Whether the refresh left every file untouched, i.e. nothing was
    /// changed, added or deleted. `total_processed` is not taken into account.
    pub fn is_noop(&self) -> bool {
        let Self {
            changed,
            added,
            deleted,
            ..
        } = *self;
        [changed, added, deleted].iter().all(|&count| count == 0)
    }
}
1242
/// Delta produced by refreshing a set of invalidated files, to be applied to
/// an existing index.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Full replacement entries for `completed_paths`, not just newly embedded
    /// chunks. `apply_refresh_update` removes completed paths before extending
    /// this set, so reused chunks must travel in this delta too.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Freshness records paired with the file they belong to.
    /// NOTE(review): presumably the new metadata to store for refreshed files — confirm in `apply_refresh_update`.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Paths whose existing entries are removed before `added_entries` is applied.
    pub completed_paths: Vec<PathBuf>,
    /// File counts describing this refresh.
    pub summary: RefreshSummary,
}
1253
/// An already-computed embedding that a refresh may reuse instead of calling
/// the embedding backend again.
#[derive(Debug, Clone)]
struct ReusableEmbedding {
    /// Exact text the vector was computed from; compared in full on lookup so
    /// a hash match alone is never trusted.
    embed_text: String,
    /// The cached embedding for `embed_text`.
    vector: Vec<f32>,
}
1259
/// Reuse lookup: file path → blake3 hash of a chunk's embed text → every
/// cached embedding whose text has that hash.
type ChunkReuseMap = HashMap<PathBuf, HashMap<blake3::Hash, Vec<ReusableEmbedding>>>;
1261
/// Search result from a semantic query
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File containing the matched symbol.
    pub file: PathBuf,
    /// Symbol name.
    pub name: String,
    /// Symbol kind.
    pub kind: SymbolKind,
    /// First line of the matched symbol.
    pub start_line: u32,
    /// Last line of the matched symbol.
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Short code snippet for display.
    pub snippet: String,
    /// Match score.
    /// NOTE(review): scale and ordering (higher = better?) are set by the search code — confirm there.
    pub score: f32,
    /// Static label naming where the result came from.
    /// NOTE(review): the set of possible values is not visible here — confirm at the producers.
    pub source: &'static str,
}
1275
1276impl SemanticIndex {
1277    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1278        debug_assert!(project_root.is_absolute());
1279        Self {
1280            entries: Vec::new(),
1281            file_mtimes: HashMap::new(),
1282            file_sizes: HashMap::new(),
1283            file_hashes: HashMap::new(),
1284            dimension,
1285            fingerprint: None,
1286            project_root,
1287            deferred_files: HashSet::new(),
1288        }
1289    }
1290
    /// Number of embedded symbol entries (one per stored chunk/vector pair).
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
1295
    /// Number of files currently tracked by the semantic index.
    ///
    /// Counted from the mtime table, so files that yielded no chunks but had
    /// their metadata recorded are included.
    pub fn indexed_file_count(&self) -> usize {
        self.file_mtimes.len()
    }
1300
1301    /// Human-readable status label for the index.
1302    pub fn status_label(&self) -> &'static str {
1303        if self.entries.is_empty() {
1304            "empty"
1305        } else {
1306            "ready"
1307        }
1308    }
1309
1310    fn collect_chunks(
1311        project_root: &Path,
1312        files: &[PathBuf],
1313    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1314        let collect_started = std::time::Instant::now();
1315        let per_file: Vec<(
1316            PathBuf,
1317            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1318        )> = files
1319            .par_iter()
1320            .map_init(HashMap::new, |parsers, file| {
1321                let result = collect_file_metadata(file).and_then(|metadata| {
1322                    collect_file_chunks(project_root, file, parsers)
1323                        .map(|chunks| (metadata, chunks))
1324                });
1325                (file.clone(), result)
1326            })
1327            .collect();
1328
1329        let mut chunks: Vec<SemanticChunk> = Vec::new();
1330        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1331
1332        for (file, result) in per_file {
1333            match result {
1334                Ok((metadata, file_chunks)) => {
1335                    file_metadata.insert(file, metadata);
1336                    chunks.extend(file_chunks);
1337                }
1338                Err(error) => {
1339                    // "unsupported file extension" is expected for non-code files
1340                    // (json, xml, .gitignore, etc.) that get included in the
1341                    // project walk. Pre-fix this was swallowed by .unwrap_or_default();
1342                    // we now skip silently to keep the log clean. Only real read/parse
1343                    // errors are worth surfacing.
1344                    if error == "unsupported file extension" {
1345                        continue;
1346                    }
1347                    slog_warn!(
1348                        "failed to collect semantic chunks for {}: {}",
1349                        file.display(),
1350                        error
1351                    );
1352                }
1353            }
1354        }
1355
1356        slog_info!(
1357            "semantic collect: {} chunks from {} files in {} ms",
1358            chunks.len(),
1359            file_metadata.len(),
1360            collect_started.elapsed().as_millis()
1361        );
1362
1363        (chunks, file_metadata)
1364    }
1365
1366    fn build_chunk_reuse_map(&self, files: &[PathBuf]) -> ChunkReuseMap {
1367        let requested: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1368        let mut reuse_map: ChunkReuseMap = HashMap::new();
1369
1370        for entry in &self.entries {
1371            if !requested.contains(entry.chunk.file.as_path()) {
1372                continue;
1373            }
1374
1375            // `embed_text` is already persisted in the current on-disk format,
1376            // so refresh-time reuse can hash it in memory and confirm the exact
1377            // string without bumping `SEMANTIC_INDEX_VERSION` and forcing every
1378            // user through a full rebuild.
1379            let hash = blake3::hash(entry.chunk.embed_text.as_bytes());
1380            reuse_map
1381                .entry(entry.chunk.file.clone())
1382                .or_default()
1383                .entry(hash)
1384                .or_default()
1385                .push(ReusableEmbedding {
1386                    embed_text: entry.chunk.embed_text.clone(),
1387                    vector: entry.vector.clone(),
1388                });
1389        }
1390
1391        reuse_map
1392    }
1393
1394    fn reusable_vector_for_chunk(
1395        reuse_map: &ChunkReuseMap,
1396        chunk: &SemanticChunk,
1397    ) -> Option<Vec<f32>> {
1398        let hash = blake3::hash(chunk.embed_text.as_bytes());
1399        reuse_map
1400            .get(&chunk.file)?
1401            .get(&hash)?
1402            .iter()
1403            .find(|candidate| candidate.embed_text == chunk.embed_text)
1404            .map(|candidate| candidate.vector.clone())
1405    }
1406
1407    fn entries_for_chunks_with_reuse<F, P>(
1408        chunks: Vec<SemanticChunk>,
1409        reuse_map: &ChunkReuseMap,
1410        embed_fn: &mut F,
1411        max_batch_size: usize,
1412        initial_observed_dimension: Option<usize>,
1413        refresh_label: &str,
1414        progress: &mut P,
1415    ) -> Result<(Vec<EmbeddingEntry>, Option<usize>), String>
1416    where
1417        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1418        P: FnMut(usize, usize),
1419    {
1420        let total_chunks = chunks.len();
1421        progress(0, total_chunks);
1422
1423        let mut entries_by_chunk: Vec<Option<EmbeddingEntry>> = vec![None; total_chunks];
1424        let mut misses: Vec<(usize, SemanticChunk)> = Vec::new();
1425
1426        for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1427            if let Some(vector) = Self::reusable_vector_for_chunk(reuse_map, &chunk) {
1428                entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1429            } else {
1430                misses.push((chunk_index, chunk));
1431            }
1432        }
1433
1434        let mut completed = total_chunks.saturating_sub(misses.len());
1435        if completed > 0 {
1436            progress(completed, total_chunks);
1437        }
1438
1439        let batch_size = max_batch_size.max(1);
1440        let mut observed_dimension = initial_observed_dimension;
1441
1442        for batch_start in (0..misses.len()).step_by(batch_size) {
1443            let batch_end = (batch_start + batch_size).min(misses.len());
1444            let batch_texts: Vec<String> = misses[batch_start..batch_end]
1445                .iter()
1446                .map(|(_, chunk)| chunk.embed_text.clone())
1447                .collect();
1448
1449            let vectors = embed_fn(batch_texts)?;
1450            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1451
1452            if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1453                match observed_dimension {
1454                    None => observed_dimension = Some(dim),
1455                    Some(expected) if dim != expected => {
1456                        return Err(format!(
1457                            "embedding dimension changed during {refresh_label}: \
1458                             cached index uses {expected}, new vectors use {dim}"
1459                        ));
1460                    }
1461                    _ => {}
1462                }
1463            }
1464
1465            for (i, vector) in vectors.into_iter().enumerate() {
1466                let (chunk_index, chunk) = misses[batch_start + i].clone();
1467                entries_by_chunk[chunk_index] = Some(EmbeddingEntry { chunk, vector });
1468            }
1469
1470            completed += batch_end - batch_start;
1471            progress(completed, total_chunks);
1472        }
1473
1474        let entries = entries_by_chunk
1475            .into_iter()
1476            .map(|entry| entry.expect("semantic refresh accounted for every chunk"))
1477            .collect();
1478
1479        Ok((entries, observed_dimension))
1480    }
1481
1482    fn build_from_chunks<F, P>(
1483        project_root: &Path,
1484        chunks: Vec<SemanticChunk>,
1485        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1486        embed_fn: &mut F,
1487        max_batch_size: usize,
1488        mut progress: Option<&mut P>,
1489    ) -> Result<Self, String>
1490    where
1491        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1492        P: FnMut(usize, usize),
1493    {
1494        debug_assert!(project_root.is_absolute());
1495        let total_chunks = chunks.len();
1496
1497        if chunks.is_empty() {
1498            return Ok(Self {
1499                entries: Vec::new(),
1500                file_mtimes: file_metadata
1501                    .iter()
1502                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
1503                    .collect(),
1504                file_sizes: file_metadata
1505                    .iter()
1506                    .map(|(path, metadata)| (path.clone(), metadata.size))
1507                    .collect(),
1508                file_hashes: file_metadata
1509                    .into_iter()
1510                    .map(|(path, metadata)| (path, metadata.content_hash))
1511                    .collect(),
1512                dimension: DEFAULT_DIMENSION,
1513                fingerprint: None,
1514                project_root: project_root.to_path_buf(),
1515                deferred_files: HashSet::new(),
1516            });
1517        }
1518
1519        // Embed in batches
1520        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1521        let mut expected_dimension: Option<usize> = None;
1522        let batch_size = max_batch_size.max(1);
1523        let embed_started = std::time::Instant::now();
1524        let batch_count = total_chunks.div_ceil(batch_size);
1525        for batch_start in (0..chunks.len()).step_by(batch_size) {
1526            let batch_end = (batch_start + batch_size).min(chunks.len());
1527            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1528                .iter()
1529                .map(|c| c.embed_text.clone())
1530                .collect();
1531
1532            let vectors = embed_fn(batch_texts)?;
1533            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1534
1535            // Track consistent dimension across all batches
1536            if let Some(dim) = vectors.first().map(|v| v.len()) {
1537                match expected_dimension {
1538                    None => expected_dimension = Some(dim),
1539                    Some(expected) if dim != expected => {
1540                        return Err(format!(
1541                            "embedding dimension changed across batches: expected {expected}, got {dim}"
1542                        ));
1543                    }
1544                    _ => {}
1545                }
1546            }
1547
1548            for (i, vector) in vectors.into_iter().enumerate() {
1549                let chunk_idx = batch_start + i;
1550                entries.push(EmbeddingEntry {
1551                    chunk: chunks[chunk_idx].clone(),
1552                    vector,
1553                });
1554            }
1555
1556            if let Some(callback) = progress.as_mut() {
1557                callback(entries.len(), total_chunks);
1558            }
1559        }
1560
1561        let embed_ms = embed_started.elapsed().as_millis();
1562        let rate = (total_chunks as u128 * 1000)
1563            .checked_div(embed_ms)
1564            .unwrap_or(0) as u64;
1565        slog_info!(
1566            "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1567            total_chunks,
1568            batch_count,
1569            embed_ms,
1570            rate
1571        );
1572
1573        let dimension = entries
1574            .first()
1575            .map(|e| e.vector.len())
1576            .unwrap_or(DEFAULT_DIMENSION);
1577
1578        Ok(Self {
1579            entries,
1580            file_mtimes: file_metadata
1581                .iter()
1582                .map(|(path, metadata)| (path.clone(), metadata.mtime))
1583                .collect(),
1584            file_sizes: file_metadata
1585                .iter()
1586                .map(|(path, metadata)| (path.clone(), metadata.size))
1587                .collect(),
1588            file_hashes: file_metadata
1589                .into_iter()
1590                .map(|(path, metadata)| (path, metadata.content_hash))
1591                .collect(),
1592            dimension,
1593            fingerprint: None,
1594            project_root: project_root.to_path_buf(),
1595            deferred_files: HashSet::new(),
1596        })
1597    }
1598
1599    /// Build the semantic index from a set of files using the provided embedding function.
1600    /// `embed_fn` takes a batch of texts and returns a batch of embedding vectors.
1601    pub fn build<F>(
1602        project_root: &Path,
1603        files: &[PathBuf],
1604        embed_fn: &mut F,
1605        max_batch_size: usize,
1606    ) -> Result<Self, String>
1607    where
1608        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1609    {
1610        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1611        Self::build_from_chunks(
1612            project_root,
1613            chunks,
1614            file_mtimes,
1615            embed_fn,
1616            max_batch_size,
1617            Option::<&mut fn(usize, usize)>::None,
1618        )
1619    }
1620
1621    /// Build the semantic index and report embedding progress using entry counts.
1622    pub fn build_with_progress<F, P>(
1623        project_root: &Path,
1624        files: &[PathBuf],
1625        embed_fn: &mut F,
1626        max_batch_size: usize,
1627        progress: &mut P,
1628    ) -> Result<Self, String>
1629    where
1630        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1631        P: FnMut(usize, usize),
1632    {
1633        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1634        let total_chunks = chunks.len();
1635        progress(0, total_chunks);
1636        Self::build_from_chunks(
1637            project_root,
1638            chunks,
1639            file_mtimes,
1640            embed_fn,
1641            max_batch_size,
1642            Some(progress),
1643        )
1644    }
1645
1646    /// Incrementally refresh entries for changed/new files only, preserving cached
1647    /// embeddings for unchanged files. Used when loading the index from disk and
1648    /// finding that a small fraction of files have moved on, deleted, or appeared.
1649    ///
1650    /// Returns `RefreshSummary` describing what changed. On success, `self` is
1651    /// mutated in place and remains a valid index.
1652    ///
1653    /// `current_files` is the full set of files the project considers indexable
1654    /// (typically `walk_project_files(...)`). Files in the cache that are no
1655    /// longer in this set are treated as deleted.
1656    pub fn refresh_stale_files<F, P>(
1657        &mut self,
1658        project_root: &Path,
1659        current_files: &[PathBuf],
1660        embed_fn: &mut F,
1661        max_batch_size: usize,
1662        progress: &mut P,
1663    ) -> Result<RefreshSummary, String>
1664    where
1665        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1666        P: FnMut(usize, usize),
1667    {
1668        self.backfill_missing_file_sizes();
1669
1670        // 1. Bucket files into deleted / changed / added.
1671        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
1672        self.deferred_files
1673            .retain(|path| current_set.contains(path.as_path()));
1674        let total_processed = current_set.len() + self.file_mtimes.len()
1675            - self
1676                .file_mtimes
1677                .keys()
1678                .filter(|path| current_set.contains(path.as_path()))
1679                .count();
1680
1681        // Files in cache that disappeared from disk OR are no longer in the
1682        // walked set. Both cases need their entries dropped.
1683        enum IndexedFileCheck {
1684            Deleted(PathBuf),
1685            MissingMetadata(PathBuf),
1686            Verified(PathBuf, FreshnessVerdict),
1687        }
1688
1689        let mut deleted: Vec<PathBuf> = Vec::new();
1690        let mut changed: Vec<PathBuf> = Vec::new();
1691        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
1692        let mut checks: Vec<Option<IndexedFileCheck>> = Vec::with_capacity(indexed_paths.len());
1693        let mut strict_verify_inputs: Vec<(usize, PathBuf, FileFreshness)> = Vec::new();
1694
1695        for indexed_path in indexed_paths {
1696            let check_index = checks.len();
1697            if !current_set.contains(indexed_path.as_path()) {
1698                checks.push(Some(IndexedFileCheck::Deleted(indexed_path)));
1699                continue;
1700            }
1701            let cached = match (
1702                self.file_mtimes.get(&indexed_path),
1703                self.file_sizes.get(&indexed_path),
1704                self.file_hashes.get(&indexed_path),
1705            ) {
1706                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
1707                    mtime: *mtime,
1708                    size: *size,
1709                    content_hash: *hash,
1710                }),
1711                _ => None,
1712            };
1713            if let Some(freshness) = cached {
1714                strict_verify_inputs.push((check_index, indexed_path, freshness));
1715                checks.push(None);
1716            } else {
1717                checks.push(Some(IndexedFileCheck::MissingMetadata(indexed_path)));
1718            }
1719        }
1720
1721        for (check_index, path, verdict) in
1722            cache_freshness::verify_files_strict_bounded(strict_verify_inputs)
1723        {
1724            checks[check_index] = Some(IndexedFileCheck::Verified(path, verdict));
1725        }
1726
1727        for check in checks {
1728            match check.expect("strict freshness check should be populated") {
1729                IndexedFileCheck::Deleted(path) => deleted.push(path),
1730                IndexedFileCheck::MissingMetadata(path) => changed.push(path),
1731                IndexedFileCheck::Verified(_path, FreshnessVerdict::HotFresh) => {}
1732                IndexedFileCheck::Verified(
1733                    path,
1734                    FreshnessVerdict::ContentFresh {
1735                        new_mtime,
1736                        new_size,
1737                    },
1738                ) => {
1739                    self.file_mtimes.insert(path.clone(), new_mtime);
1740                    self.file_sizes.insert(path, new_size);
1741                }
1742                IndexedFileCheck::Verified(
1743                    path,
1744                    FreshnessVerdict::Stale | FreshnessVerdict::Deleted,
1745                ) => {
1746                    changed.push(path);
1747                }
1748            }
1749        }
1750
1751        // Files in walk that were never indexed.
1752        let mut added: Vec<PathBuf> = Vec::new();
1753        for path in current_files {
1754            if !self.file_mtimes.contains_key(path) {
1755                added.push(path.clone());
1756            }
1757        }
1758
1759        // Fast path: nothing to do.
1760        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
1761            progress(0, 0);
1762            return Ok(RefreshSummary {
1763                total_processed,
1764                ..RefreshSummary::default()
1765            });
1766        }
1767
1768        // 2. Drop entries for deleted files immediately. Changed files are only
1769        //    replaced after successful re-extraction + embedding so transient
1770        //    read/parse errors keep the stale-but-valid cache entry.
1771        if !deleted.is_empty() {
1772            self.remove_indexed_files(&deleted);
1773        }
1774
1775        // 3. Embed the changed + added set, if any.
1776        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
1777        to_embed.extend(changed.iter().cloned());
1778        to_embed.extend(added.iter().cloned());
1779
1780        if to_embed.is_empty() {
1781            // Only deletions happened.
1782            progress(0, 0);
1783            return Ok(RefreshSummary {
1784                changed: 0,
1785                added: 0,
1786                deleted: deleted.len(),
1787                total_processed,
1788            });
1789        }
1790
1791        let reuse_map = self.build_chunk_reuse_map(&changed);
1792        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
1793        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
1794        let vanished = to_embed
1795            .iter()
1796            .filter(|path| {
1797                changed_set.contains(path.as_path())
1798                    && !fresh_metadata.contains_key(*path)
1799                    && !path.exists()
1800            })
1801            .cloned()
1802            .collect::<Vec<_>>();
1803        if !vanished.is_empty() {
1804            self.remove_indexed_files(&vanished);
1805            deleted.extend(vanished);
1806        }
1807
1808        if chunks.is_empty() {
1809            progress(0, 0);
1810            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1811            for file in &successful_files {
1812                self.deferred_files.remove(file);
1813            }
1814            if !successful_files.is_empty() {
1815                self.entries
1816                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
1817            }
1818            let changed_count = changed
1819                .iter()
1820                .filter(|path| successful_files.contains(*path))
1821                .count();
1822            let added_count = added
1823                .iter()
1824                .filter(|path| successful_files.contains(*path))
1825                .count();
1826            for (file, metadata) in fresh_metadata {
1827                self.file_mtimes.insert(file.clone(), metadata.mtime);
1828                self.file_sizes.insert(file.clone(), metadata.size);
1829                self.file_hashes.insert(file.clone(), metadata.content_hash);
1830            }
1831            return Ok(RefreshSummary {
1832                changed: changed_count,
1833                added: added_count,
1834                deleted: deleted.len(),
1835                total_processed,
1836            });
1837        }
1838
1839        // 4. Build the full replacement set, reusing cached vectors for chunks
1840        //    whose embed_text is unchanged and embedding only cache misses.
1841        let existing_dimension = if self.entries.is_empty() {
1842            None
1843        } else {
1844            Some(self.dimension)
1845        };
1846        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
1847            chunks,
1848            &reuse_map,
1849            embed_fn,
1850            max_batch_size,
1851            existing_dimension,
1852            "incremental refresh",
1853            progress,
1854        )?;
1855
1856        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1857        for file in &successful_files {
1858            self.deferred_files.remove(file);
1859        }
1860        if !successful_files.is_empty() {
1861            self.entries
1862                .retain(|entry| !successful_files.contains(&entry.chunk.file));
1863        }
1864
1865        self.entries.extend(new_entries);
1866        for (file, metadata) in fresh_metadata {
1867            self.file_mtimes.insert(file.clone(), metadata.mtime);
1868            self.file_sizes.insert(file.clone(), metadata.size);
1869            self.file_hashes.insert(file, metadata.content_hash);
1870        }
1871        if let Some(dim) = observed_dimension {
1872            self.dimension = dim;
1873        }
1874
1875        Ok(RefreshSummary {
1876            changed: changed
1877                .iter()
1878                .filter(|path| successful_files.contains(*path))
1879                .count(),
1880            added: added
1881                .iter()
1882                .filter(|path| successful_files.contains(*path))
1883                .count(),
1884            deleted: deleted.len(),
1885            total_processed,
1886        })
1887    }
1888
1889    /// Refresh exactly the files invalidated by the live watcher, without
1890    /// treating the provided path list as the whole project. This is the
1891    /// watcher-side counterpart to `refresh_stale_files`: it drops any stale
1892    /// entries for the requested paths from this in-memory index, re-extracts
1893    /// whatever still exists on disk, embeds those chunks, and returns the
1894    /// delta needed for another in-memory index to apply the same update.
1895    pub fn refresh_invalidated_files<F, P>(
1896        &mut self,
1897        project_root: &Path,
1898        paths: &[PathBuf],
1899        embed_fn: &mut F,
1900        max_batch_size: usize,
1901        max_files: usize,
1902        progress: &mut P,
1903    ) -> Result<InvalidatedFilesRefresh, String>
1904    where
1905        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1906        P: FnMut(usize, usize),
1907    {
1908        self.backfill_missing_file_sizes();
1909
1910        self.deferred_files.retain(|path| path.exists());
1911        let mut requested_paths = paths.to_vec();
1912        requested_paths.extend(self.deferred_files.iter().cloned());
1913        requested_paths.sort();
1914        requested_paths.dedup();
1915        let total_processed = requested_paths.len();
1916
1917        if requested_paths.is_empty() {
1918            progress(0, 0);
1919            return Ok(InvalidatedFilesRefresh {
1920                summary: RefreshSummary {
1921                    total_processed,
1922                    ..RefreshSummary::default()
1923                },
1924                ..InvalidatedFilesRefresh::default()
1925            });
1926        }
1927
1928        let previously_indexed: HashSet<PathBuf> = requested_paths
1929            .iter()
1930            .filter(|path| self.file_mtimes.contains_key(*path))
1931            .cloned()
1932            .collect();
1933        let reuse_map = self.build_chunk_reuse_map(&requested_paths);
1934
1935        // The watcher path has already invalidated these files in the request
1936        // thread's live index. Mirror that behavior here before inserting any
1937        // fresh chunks so parse/read failures do not resurrect stale entries.
1938        self.remove_indexed_files(&requested_paths);
1939
1940        let existing_paths = requested_paths
1941            .iter()
1942            .filter(|path| path.exists())
1943            .cloned()
1944            .collect::<Vec<_>>();
1945        let deleted = requested_paths
1946            .iter()
1947            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
1948            .count();
1949
1950        if existing_paths.is_empty() {
1951            for path in &requested_paths {
1952                if !path.exists() {
1953                    self.deferred_files.remove(path);
1954                }
1955            }
1956            progress(0, 0);
1957            return Ok(InvalidatedFilesRefresh {
1958                completed_paths: requested_paths,
1959                summary: RefreshSummary {
1960                    deleted,
1961                    total_processed,
1962                    ..RefreshSummary::default()
1963                },
1964                ..InvalidatedFilesRefresh::default()
1965            });
1966        }
1967
1968        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);
1969
1970        let retained_file_count = self.file_mtimes.len();
1971        let changed_successful_count = existing_paths
1972            .iter()
1973            .filter(|path| {
1974                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1975            })
1976            .count();
1977        let available_new_files =
1978            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
1979        let new_successful_files = existing_paths
1980            .iter()
1981            .filter(|path| {
1982                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1983            })
1984            .cloned()
1985            .collect::<Vec<_>>();
1986        if new_successful_files.len() > available_new_files {
1987            let allowed_new_files = new_successful_files
1988                .iter()
1989                .take(available_new_files)
1990                .cloned()
1991                .collect::<HashSet<_>>();
1992            let deferred_new_files = new_successful_files
1993                .into_iter()
1994                .filter(|path| !allowed_new_files.contains(path))
1995                .collect::<HashSet<_>>();
1996
1997            fresh_metadata.retain(|file, _| {
1998                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
1999            });
2000            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));
2001
2002            if !deferred_new_files.is_empty() {
2003                for path in &deferred_new_files {
2004                    self.deferred_files.insert(path.clone());
2005                }
2006                slog_warn!(
2007                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
2008                    deferred_new_files.len(),
2009                    max_files
2010                );
2011            }
2012        }
2013
2014        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
2015        for file in &successful_files {
2016            self.deferred_files.remove(file);
2017        }
2018        let changed = successful_files
2019            .iter()
2020            .filter(|path| previously_indexed.contains(path.as_path()))
2021            .count();
2022        let added = successful_files.len().saturating_sub(changed);
2023        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());
2024
2025        if chunks.is_empty() {
2026            progress(0, 0);
2027            for (file, metadata) in fresh_metadata {
2028                let freshness = FileFreshness {
2029                    mtime: metadata.mtime,
2030                    size: metadata.size,
2031                    content_hash: metadata.content_hash,
2032                };
2033                self.file_mtimes.insert(file.clone(), freshness.mtime);
2034                self.file_sizes.insert(file.clone(), freshness.size);
2035                self.file_hashes
2036                    .insert(file.clone(), freshness.content_hash);
2037                updated_metadata.push((file, freshness));
2038            }
2039
2040            return Ok(InvalidatedFilesRefresh {
2041                updated_metadata,
2042                completed_paths: requested_paths,
2043                summary: RefreshSummary {
2044                    changed,
2045                    added,
2046                    deleted,
2047                    total_processed,
2048                },
2049                ..InvalidatedFilesRefresh::default()
2050            });
2051        }
2052
2053        let initial_observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty()
2054        {
2055            None
2056        } else {
2057            Some(self.dimension)
2058        };
2059        let (new_entries, observed_dimension) = Self::entries_for_chunks_with_reuse(
2060            chunks,
2061            &reuse_map,
2062            embed_fn,
2063            max_batch_size,
2064            initial_observed_dimension,
2065            "invalidated-file refresh",
2066            progress,
2067        )?;
2068
2069        let added_entries = new_entries.clone();
2070        self.entries.extend(new_entries);
2071        for (file, metadata) in fresh_metadata {
2072            let freshness = FileFreshness {
2073                mtime: metadata.mtime,
2074                size: metadata.size,
2075                content_hash: metadata.content_hash,
2076            };
2077            self.file_mtimes.insert(file.clone(), freshness.mtime);
2078            self.file_sizes.insert(file.clone(), freshness.size);
2079            self.file_hashes
2080                .insert(file.clone(), freshness.content_hash);
2081            updated_metadata.push((file, freshness));
2082        }
2083        if let Some(dim) = observed_dimension {
2084            self.dimension = dim;
2085        }
2086
2087        Ok(InvalidatedFilesRefresh {
2088            added_entries,
2089            updated_metadata,
2090            completed_paths: requested_paths,
2091            summary: RefreshSummary {
2092                changed,
2093                added,
2094                deleted,
2095                total_processed,
2096            },
2097        })
2098    }
2099
2100    pub fn apply_refresh_update(
2101        &mut self,
2102        added_entries: Vec<EmbeddingEntry>,
2103        updated_metadata: Vec<(PathBuf, FileFreshness)>,
2104        completed_paths: &[PathBuf],
2105    ) {
2106        // `added_entries` is the complete replacement set for completed paths:
2107        // freshly embedded misses plus reused chunks carrying refreshed metadata.
2108        // Removing first is safe only because producers include both kinds.
2109        self.remove_indexed_files(completed_paths);
2110
2111        let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
2112        self.entries.extend(added_entries);
2113        for (file, freshness) in updated_metadata {
2114            self.file_mtimes.insert(file.clone(), freshness.mtime);
2115            self.file_sizes.insert(file.clone(), freshness.size);
2116            self.file_hashes.insert(file, freshness.content_hash);
2117        }
2118        if let Some(dim) = observed_dimension {
2119            self.dimension = dim;
2120        }
2121    }
2122
2123    fn remove_indexed_files(&mut self, files: &[PathBuf]) {
2124        let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
2125        self.entries
2126            .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
2127        for path in files {
2128            self.file_mtimes.remove(path);
2129            self.file_sizes.remove(path);
2130            self.file_hashes.remove(path);
2131        }
2132    }
2133
2134    /// Search the index with a query embedding, returning top-K results sorted by relevance
2135    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
2136        if self.entries.is_empty() || query_vector.len() != self.dimension {
2137            return Vec::new();
2138        }
2139
2140        let mut scored: Vec<(f32, usize)> = self
2141            .entries
2142            .iter()
2143            .enumerate()
2144            .map(|(i, entry)| {
2145                let mut score = cosine_similarity(query_vector, &entry.vector);
2146                if entry.chunk.exported {
2147                    score *= 1.1;
2148                }
2149                (score, i)
2150            })
2151            .collect();
2152
2153        let keep = top_k.min(scored.len());
2154        if keep == 0 {
2155            return Vec::new();
2156        }
2157
2158        if keep < scored.len() {
2159            scored.select_nth_unstable_by(keep, semantic_score_order);
2160            scored.truncate(keep);
2161        }
2162        scored.sort_by(semantic_score_order);
2163
2164        scored
2165            .into_iter()
2166            // Keep the selected best-first slice mapped without reintroducing the
2167            // old `> 0.0` floor: top_k has already been selected, and zero-score
2168            // tail entries remain observable when requested.
2169            .map(|(score, idx)| {
2170                let entry = &self.entries[idx];
2171                SemanticResult {
2172                    file: entry.chunk.file.clone(),
2173                    name: entry.chunk.name.clone(),
2174                    kind: entry.chunk.kind.clone(),
2175                    start_line: entry.chunk.start_line,
2176                    end_line: entry.chunk.end_line,
2177                    exported: entry.chunk.exported,
2178                    snippet: entry.chunk.snippet.clone(),
2179                    score,
2180                    source: "semantic",
2181                }
2182            })
2183            .collect()
2184    }
2185
2186    /// Number of indexed entries
2187    pub fn len(&self) -> usize {
2188        self.entries.len()
2189    }
2190
2191    /// Check if a file needs re-indexing based on mtime/size
2192    pub fn is_file_stale(&self, file: &Path) -> bool {
2193        let Some(stored_mtime) = self.file_mtimes.get(file) else {
2194            return true;
2195        };
2196        let Some(stored_size) = self.file_sizes.get(file) else {
2197            return true;
2198        };
2199        let Some(stored_hash) = self.file_hashes.get(file) else {
2200            return true;
2201        };
2202        let cached = FileFreshness {
2203            mtime: *stored_mtime,
2204            size: *stored_size,
2205            content_hash: *stored_hash,
2206        };
2207        match cache_freshness::verify_file_strict(file, &cached) {
2208            FreshnessVerdict::HotFresh => false,
2209            FreshnessVerdict::ContentFresh { .. } => false,
2210            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
2211        }
2212    }
2213
2214    fn backfill_missing_file_sizes(&mut self) {
2215        for path in self.file_mtimes.keys() {
2216            if self.file_sizes.contains_key(path) {
2217                continue;
2218            }
2219            if let Ok(metadata) = fs::metadata(path) {
2220                self.file_sizes.insert(path.clone(), metadata.len());
2221                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
2222                    self.file_hashes.insert(path.clone(), hash);
2223                }
2224            }
2225        }
2226    }
2227
2228    /// Remove entries for a specific file
2229    pub fn remove_file(&mut self, file: &Path) {
2230        self.invalidate_file(file);
2231    }
2232
2233    pub fn invalidate_file(&mut self, file: &Path) {
2234        let canonical_file = canonicalize_existing_or_deleted_path(file);
2235        self.entries
2236            .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
2237        self.file_mtimes.remove(file);
2238        self.file_sizes.remove(file);
2239        self.file_hashes.remove(file);
2240        if canonical_file.as_path() != file {
2241            self.file_mtimes.remove(&canonical_file);
2242            self.file_sizes.remove(&canonical_file);
2243            self.file_hashes.remove(&canonical_file);
2244        }
2245    }
2246
2247    /// Get the embedding dimension
2248    pub fn dimension(&self) -> usize {
2249        self.dimension
2250    }
2251
2252    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
2253        self.fingerprint.as_ref()
2254    }
2255
2256    pub fn backend_label(&self) -> Option<&str> {
2257        self.fingerprint.as_ref().map(|f| f.backend.as_str())
2258    }
2259
2260    pub fn model_label(&self) -> Option<&str> {
2261        self.fingerprint.as_ref().map(|f| f.model.as_str())
2262    }
2263
2264    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
2265        self.fingerprint = Some(fingerprint);
2266    }
2267
2268    /// Write the semantic index to disk using atomic temp+rename pattern
2269    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2270        // Don't persist empty indexes — they would be loaded on next startup
2271        // and prevent a fresh build that might find files.
2272        if self.entries.is_empty() {
2273            slog_info!("skipping semantic index persistence (0 entries)");
2274            return;
2275        }
2276        let dir = storage_dir.join("semantic").join(project_key);
2277        if let Err(e) = fs::create_dir_all(&dir) {
2278            slog_warn!("failed to create semantic cache dir: {}", e);
2279            return;
2280        }
2281        let data_path = dir.join("semantic.bin");
2282        let tmp_path = dir.join(format!(
2283            "semantic.bin.tmp.{}.{}",
2284            std::process::id(),
2285            SystemTime::now()
2286                .duration_since(SystemTime::UNIX_EPOCH)
2287                .unwrap_or(Duration::ZERO)
2288                .as_nanos()
2289        ));
2290        let write_result = (|| -> io::Result<usize> {
2291            let file = fs::File::create(&tmp_path)?;
2292            let mut writer = BufWriter::new(file);
2293            let bytes_written = self.write_to_writer(&mut writer)?;
2294            writer.flush()?;
2295            writer.get_ref().sync_all()?;
2296            Ok(bytes_written)
2297        })();
2298        let bytes_written = match write_result {
2299            Ok(bytes_written) => bytes_written,
2300            Err(e) => {
2301                slog_warn!("failed to write semantic index: {}", e);
2302                let _ = fs::remove_file(&tmp_path);
2303                return;
2304            }
2305        };
2306        if let Err(e) = fs::rename(&tmp_path, &data_path) {
2307            slog_warn!("failed to rename semantic index: {}", e);
2308            let _ = fs::remove_file(&tmp_path);
2309            return;
2310        }
2311        slog_info!(
2312            "semantic index persisted: {} entries, {:.1} KB",
2313            self.entries.len(),
2314            bytes_written as f64 / 1024.0
2315        );
2316    }
2317
2318    /// Read the semantic index from disk
2319    pub fn read_from_disk(
2320        storage_dir: &Path,
2321        project_key: &str,
2322        current_canonical_root: &Path,
2323        is_worktree_bridge: bool,
2324        expected_fingerprint: Option<&str>,
2325    ) -> Option<Self> {
2326        debug_assert!(current_canonical_root.is_absolute());
2327        let data_path = storage_dir
2328            .join("semantic")
2329            .join(project_key)
2330            .join("semantic.bin");
2331        let file = fs::File::open(&data_path).ok()?;
2332        let file_len = usize::try_from(file.metadata().ok()?.len()).ok()?;
2333        if file_len < HEADER_BYTES_V1 {
2334            slog_warn!(
2335                "corrupt semantic index (too small: {} bytes), removing",
2336                file_len
2337            );
2338            if !is_worktree_bridge {
2339                let _ = fs::remove_file(&data_path);
2340            }
2341            return None;
2342        }
2343
2344        let mut reader = BufReader::new(file);
2345        let mut version_buf = [0u8; 1];
2346        reader.read_exact(&mut version_buf).ok()?;
2347        let version = version_buf[0];
2348        if version != SEMANTIC_INDEX_VERSION_V6 {
2349            slog_info!(
2350                "cached semantic index version {} is older than {}, rebuilding",
2351                version,
2352                SEMANTIC_INDEX_VERSION_V6
2353            );
2354            if !is_worktree_bridge {
2355                let _ = fs::remove_file(&data_path);
2356            }
2357            return None;
2358        }
2359        match Self::from_reader_after_version(
2360            reader,
2361            version,
2362            current_canonical_root,
2363            Some(file_len),
2364            1,
2365        ) {
2366            Ok(index) => {
2367                if index.entries.is_empty() {
2368                    slog_info!("cached semantic index is empty, will rebuild");
2369                    if !is_worktree_bridge {
2370                        let _ = fs::remove_file(&data_path);
2371                    }
2372                    return None;
2373                }
2374                if let Some(expected) = expected_fingerprint {
2375                    let matches = index
2376                        .fingerprint()
2377                        .map(|fingerprint| fingerprint.matches_expected(expected))
2378                        .unwrap_or(false);
2379                    if !matches {
2380                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2381                        if !is_worktree_bridge {
2382                            let _ = fs::remove_file(&data_path);
2383                        }
2384                        return None;
2385                    }
2386                }
2387                slog_info!(
2388                    "loaded semantic index from disk: {} entries",
2389                    index.entries.len()
2390                );
2391                Some(index)
2392            }
2393            Err(e) => {
2394                slog_warn!("corrupt semantic index, rebuilding: {}", e);
2395                if !is_worktree_bridge {
2396                    let _ = fs::remove_file(&data_path);
2397                }
2398                None
2399            }
2400        }
2401    }
2402
2403    /// Serialize the index to bytes for disk persistence
2404    pub fn to_bytes(&self) -> Vec<u8> {
2405        let mut buf = Vec::new();
2406        self.write_to_writer(&mut buf)
2407            .expect("writing semantic index to Vec cannot fail");
2408        buf
2409    }
2410
2411    fn write_to_writer<W: Write>(&self, writer: &mut W) -> io::Result<usize> {
2412        let mut bytes_written = 0usize;
2413        let fingerprint = self.fingerprint.as_ref().and_then(|fingerprint| {
2414            let encoded = fingerprint.as_string();
2415            if encoded.is_empty() {
2416                None
2417            } else {
2418                Some(encoded)
2419            }
2420        });
2421        let fp_bytes_ref = fingerprint.as_deref().map(str::as_bytes).unwrap_or(&[]);
2422        let file_mtime_count = self
2423            .file_mtimes
2424            .iter()
2425            .filter(|(path, _)| cache_relative_path(&self.project_root, path).is_some())
2426            .count();
2427        let entry_count = self
2428            .entries
2429            .iter()
2430            .filter(|entry| cache_relative_path(&self.project_root, &entry.chunk.file).is_some())
2431            .count();
2432
2433        // Header: version(1) + dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint
2434        //
2435        // V6 is the single write format. Layout extends V5:
2436        //   - fingerprint is always represented (absent ⇒ fingerprint_len=0,
2437        //     no bytes follow). Uniform format simplifies the reader.
2438        //   - paths are relative to project_root.
2439        //   - file metadata stored as secs(u64) + subsec_nanos(u32) + size(u64) + blake3(32).
2440        //     Preserves full APFS/ext4/NTFS precision and catches mtime ties.
2441        //
2442        // V1/V2 remain readable for backward compatibility (see from_bytes).
2443        // V3/V4 load as compatible formats but are rejected on disk so snippets
2444        // and file sizes are rebuilt once.
2445        let version = SEMANTIC_INDEX_VERSION_V6;
2446        write_counted(writer, &[version], &mut bytes_written)?;
2447        write_counted(
2448            writer,
2449            &(self.dimension as u32).to_le_bytes(),
2450            &mut bytes_written,
2451        )?;
2452        write_counted(
2453            writer,
2454            &(entry_count as u32).to_le_bytes(),
2455            &mut bytes_written,
2456        )?;
2457        write_counted(
2458            writer,
2459            &(fp_bytes_ref.len() as u32).to_le_bytes(),
2460            &mut bytes_written,
2461        )?;
2462        write_counted(writer, fp_bytes_ref, &mut bytes_written)?;
2463
2464        // File mtime table: count(4) + entries
2465        // V3 layout per entry: path_len(4) + path + secs(8) + subsec_nanos(4)
2466        write_counted(
2467            writer,
2468            &(file_mtime_count as u32).to_le_bytes(),
2469            &mut bytes_written,
2470        )?;
2471        for (path, mtime) in &self.file_mtimes {
2472            let Some(relative) = cache_relative_path(&self.project_root, path) else {
2473                continue;
2474            };
2475            let relative = relative.to_string_lossy();
2476            let path_bytes = relative.as_bytes();
2477            write_counted(
2478                writer,
2479                &(path_bytes.len() as u32).to_le_bytes(),
2480                &mut bytes_written,
2481            )?;
2482            write_counted(writer, path_bytes, &mut bytes_written)?;
2483            let duration = mtime
2484                .duration_since(SystemTime::UNIX_EPOCH)
2485                .unwrap_or_default();
2486            write_counted(
2487                writer,
2488                &duration.as_secs().to_le_bytes(),
2489                &mut bytes_written,
2490            )?;
2491            write_counted(
2492                writer,
2493                &duration.subsec_nanos().to_le_bytes(),
2494                &mut bytes_written,
2495            )?;
2496            let size = self.file_sizes.get(path).copied().unwrap_or_default();
2497            write_counted(writer, &size.to_le_bytes(), &mut bytes_written)?;
2498            let hash = self
2499                .file_hashes
2500                .get(path)
2501                .copied()
2502                .unwrap_or_else(cache_freshness::zero_hash);
2503            write_counted(writer, hash.as_bytes(), &mut bytes_written)?;
2504        }
2505
2506        // Entries: each is metadata + vector
2507        for entry in &self.entries {
2508            let Some(relative) = cache_relative_path(&self.project_root, &entry.chunk.file) else {
2509                continue;
2510            };
2511            let c = &entry.chunk;
2512
2513            // File path
2514            let relative = relative.to_string_lossy();
2515            let file_bytes = relative.as_bytes();
2516            write_counted(
2517                writer,
2518                &(file_bytes.len() as u32).to_le_bytes(),
2519                &mut bytes_written,
2520            )?;
2521            write_counted(writer, file_bytes, &mut bytes_written)?;
2522
2523            // Name
2524            let name_bytes = c.name.as_bytes();
2525            write_counted(
2526                writer,
2527                &(name_bytes.len() as u32).to_le_bytes(),
2528                &mut bytes_written,
2529            )?;
2530            write_counted(writer, name_bytes, &mut bytes_written)?;
2531
2532            // Kind (1 byte)
2533            write_counted(writer, &[symbol_kind_to_u8(&c.kind)], &mut bytes_written)?;
2534
2535            // Lines + exported
2536            write_counted(
2537                writer,
2538                &(c.start_line as u32).to_le_bytes(),
2539                &mut bytes_written,
2540            )?;
2541            write_counted(
2542                writer,
2543                &(c.end_line as u32).to_le_bytes(),
2544                &mut bytes_written,
2545            )?;
2546            write_counted(writer, &[c.exported as u8], &mut bytes_written)?;
2547
2548            // Snippet
2549            let snippet_bytes = c.snippet.as_bytes();
2550            write_counted(
2551                writer,
2552                &(snippet_bytes.len() as u32).to_le_bytes(),
2553                &mut bytes_written,
2554            )?;
2555            write_counted(writer, snippet_bytes, &mut bytes_written)?;
2556
2557            // Embed text
2558            let embed_bytes = c.embed_text.as_bytes();
2559            write_counted(
2560                writer,
2561                &(embed_bytes.len() as u32).to_le_bytes(),
2562                &mut bytes_written,
2563            )?;
2564            write_counted(writer, embed_bytes, &mut bytes_written)?;
2565
2566            // Vector (f32 array)
2567            for &val in &entry.vector {
2568                write_counted(writer, &val.to_le_bytes(), &mut bytes_written)?;
2569            }
2570        }
2571
2572        Ok(bytes_written)
2573    }
2574
2575    /// Deserialize the index from bytes
2576    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2577        debug_assert!(current_canonical_root.is_absolute());
2578        if data.len() < HEADER_BYTES_V1 {
2579            return Err("data too short".to_string());
2580        }
2581
2582        Self::from_reader_after_version(
2583            Cursor::new(&data[1..]),
2584            data[0],
2585            current_canonical_root,
2586            Some(data.len()),
2587            1,
2588        )
2589    }
2590
2591    fn from_reader_after_version<R: Read>(
2592        reader: R,
2593        version: u8,
2594        current_canonical_root: &Path,
2595        total_len: Option<usize>,
2596        bytes_read: usize,
2597    ) -> Result<Self, String> {
2598        debug_assert!(current_canonical_root.is_absolute());
2599        let mut reader = CountingReader::with_bytes_read(reader, bytes_read);
2600
2601        if version != SEMANTIC_INDEX_VERSION_V1
2602            && version != SEMANTIC_INDEX_VERSION_V2
2603            && version != SEMANTIC_INDEX_VERSION_V3
2604            && version != SEMANTIC_INDEX_VERSION_V4
2605            && version != SEMANTIC_INDEX_VERSION_V5
2606            && version != SEMANTIC_INDEX_VERSION_V6
2607        {
2608            return Err(format!("unsupported version: {}", version));
2609        }
2610        // V2 and newer share the same header layout (V3/V4/V5 only differ from
2611        // V2 in the per-mtime entry layout): version(1) + dimension(4) +
2612        // entry_count(4) + fingerprint_len(4) + fingerprint bytes.
2613        if (version == SEMANTIC_INDEX_VERSION_V2
2614            || version == SEMANTIC_INDEX_VERSION_V3
2615            || version == SEMANTIC_INDEX_VERSION_V4
2616            || version == SEMANTIC_INDEX_VERSION_V5
2617            || version == SEMANTIC_INDEX_VERSION_V6)
2618            && total_len.is_some_and(|len| len < HEADER_BYTES_V2)
2619        {
2620            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2621        }
2622
2623        let dimension = read_u32_stream(&mut reader)? as usize;
2624        let entry_count = read_u32_stream(&mut reader)? as usize;
2625        validate_embedding_dimension(dimension)?;
2626        if entry_count > MAX_ENTRIES {
2627            return Err(format!("too many semantic index entries: {}", entry_count));
2628        }
2629
2630        // Fingerprint handling:
2631        //   - V1: no fingerprint field at all.
2632        //   - V2: fingerprint_len + fingerprint bytes; always present (writer
2633        //     only emitted V2 when fingerprint was Some).
2634        //   - V3+: fingerprint_len always present; fingerprint_len==0 ⇒ None.
2635        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2636            || version == SEMANTIC_INDEX_VERSION_V3
2637            || version == SEMANTIC_INDEX_VERSION_V4
2638            || version == SEMANTIC_INDEX_VERSION_V5
2639            || version == SEMANTIC_INDEX_VERSION_V6;
2640        let fingerprint = if has_fingerprint_field {
2641            let fingerprint_len = read_u32_stream(&mut reader)? as usize;
2642            if total_len
2643                .is_some_and(|len| reader.bytes_read().saturating_add(fingerprint_len) > len)
2644            {
2645                return Err("unexpected end of data reading fingerprint".to_string());
2646            }
2647            if fingerprint_len == 0 {
2648                None
2649            } else {
2650                let mut raw = vec![0u8; fingerprint_len];
2651                read_exact_stream(
2652                    &mut reader,
2653                    &mut raw,
2654                    "unexpected end of data reading fingerprint",
2655                )?;
2656                let raw = String::from_utf8_lossy(&raw).to_string();
2657                Some(
2658                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2659                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2660                )
2661            }
2662        } else {
2663            None
2664        };
2665
2666        // File mtimes
2667        let mtime_count = read_u32_stream(&mut reader)? as usize;
2668        if mtime_count > MAX_ENTRIES {
2669            return Err(format!("too many semantic file mtimes: {}", mtime_count));
2670        }
2671
2672        let vector_bytes = entry_count
2673            .checked_mul(dimension)
2674            .and_then(|count| count.checked_mul(F32_BYTES))
2675            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2676        if total_len.is_some_and(|len| vector_bytes > len.saturating_sub(reader.bytes_read())) {
2677            return Err("semantic index vectors exceed available data".to_string());
2678        }
2679
2680        let mut file_mtimes = HashMap::with_capacity(mtime_count);
2681        let mut file_sizes = HashMap::with_capacity(mtime_count);
2682        let mut file_hashes = HashMap::with_capacity(mtime_count);
2683        for _ in 0..mtime_count {
2684            let path = read_string_stream(&mut reader, total_len)?;
2685            let secs = read_u64_stream(&mut reader)?;
2686            // V3+ persists subsec_nanos alongside secs so staleness checks
2687            // survive restart round-trips. V1/V2 load with 0 nanos, which
2688            // causes one rebuild on upgrade (they never matched live APFS
2689            // mtimes anyway — the bug v0.15.2 fixes). After that rebuild,
2690            // the cache is persisted as V3 and stabilises.
2691            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2692                || version == SEMANTIC_INDEX_VERSION_V4
2693                || version == SEMANTIC_INDEX_VERSION_V5
2694                || version == SEMANTIC_INDEX_VERSION_V6
2695            {
2696                read_u32_stream(&mut reader)?
2697            } else {
2698                0
2699            };
2700            let size =
2701                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2702                    read_u64_stream(&mut reader)?
2703                } else {
2704                    0
2705                };
2706            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2707                let mut hash_bytes = [0u8; 32];
2708                read_exact_stream(
2709                    &mut reader,
2710                    &mut hash_bytes,
2711                    "unexpected end of data reading content hash",
2712                )?;
2713                blake3::Hash::from_bytes(hash_bytes)
2714            } else {
2715                cache_freshness::zero_hash()
2716            };
2717            // Hardening against corrupt / maliciously crafted cache files
2718            // (v0.15.2). `Duration::new(secs, nanos)` can panic when the
2719            // nanosecond carry overflows the second counter, and
2720            // `SystemTime + Duration` can panic on carry past the platform's
2721            // upper bound. Explicit validation keeps a corrupted semantic.bin
2722            // from taking down the whole aft process.
2723            if nanos >= 1_000_000_000 {
2724                return Err(format!(
2725                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
2726                    nanos
2727                ));
2728            }
2729            let duration = std::time::Duration::new(secs, nanos);
2730            let mtime = SystemTime::UNIX_EPOCH
2731                .checked_add(duration)
2732                .ok_or_else(|| {
2733                    format!(
2734                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2735                        secs, nanos
2736                    )
2737                })?;
2738            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2739                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2740                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2741            } else {
2742                PathBuf::from(path)
2743            };
2744            file_mtimes.insert(path.clone(), mtime);
2745            file_sizes.insert(path.clone(), size);
2746            file_hashes.insert(path, content_hash);
2747        }
2748
2749        // Entries
2750        let mut entries = Vec::with_capacity(entry_count);
2751        for _ in 0..entry_count {
2752            let raw_file = PathBuf::from(read_string_stream(&mut reader, total_len)?);
2753            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2754                cached_path_under_root(current_canonical_root, &raw_file)
2755                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2756            } else {
2757                raw_file
2758            };
2759            let name = read_string_stream(&mut reader, total_len)?;
2760
2761            let kind = u8_to_symbol_kind(read_u8_stream(&mut reader, "unexpected end of data")?);
2762
2763            let start_line = read_u32_stream(&mut reader)?;
2764            let end_line = read_u32_stream(&mut reader)?;
2765
2766            let exported = read_u8_stream(&mut reader, "unexpected end of data")? != 0;
2767
2768            let snippet = read_string_stream(&mut reader, total_len)?;
2769            let embed_text = read_string_stream(&mut reader, total_len)?;
2770
2771            // Vector
2772            let vec_bytes = dimension
2773                .checked_mul(F32_BYTES)
2774                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2775            if total_len.is_some_and(|len| reader.bytes_read().saturating_add(vec_bytes) > len) {
2776                return Err("unexpected end of data reading vector".to_string());
2777            }
2778            let mut vector = Vec::with_capacity(dimension);
2779            for _ in 0..dimension {
2780                let mut bytes = [0u8; F32_BYTES];
2781                read_exact_stream(
2782                    &mut reader,
2783                    &mut bytes,
2784                    "unexpected end of data reading vector",
2785                )?;
2786                vector.push(f32::from_le_bytes(bytes));
2787            }
2788
2789            entries.push(EmbeddingEntry {
2790                chunk: SemanticChunk {
2791                    file,
2792                    name,
2793                    kind,
2794                    start_line,
2795                    end_line,
2796                    exported,
2797                    embed_text,
2798                    snippet,
2799                },
2800                vector,
2801            });
2802        }
2803
2804        if entries.len() != entry_count {
2805            return Err(format!(
2806                "semantic cache entry count drift: header={} decoded={}",
2807                entry_count,
2808                entries.len()
2809            ));
2810        }
2811        for entry in &entries {
2812            if !file_mtimes.contains_key(&entry.chunk.file) {
2813                return Err(format!(
2814                    "semantic cache metadata missing for entry file {}",
2815                    entry.chunk.file.display()
2816                ));
2817            }
2818        }
2819
2820        Ok(Self {
2821            entries,
2822            file_mtimes,
2823            file_sizes,
2824            file_hashes,
2825            dimension,
2826            fingerprint,
2827            project_root: current_canonical_root.to_path_buf(),
2828            deferred_files: HashSet::new(),
2829        })
2830    }
2831}
2832
/// Write all of `bytes` to `writer` and, on success, add their length to the
/// running `bytes_written` tally (saturating at `usize::MAX`).
///
/// If the write fails, the error is returned and the tally is left untouched.
fn write_counted<W: Write>(
    writer: &mut W,
    bytes: &[u8],
    bytes_written: &mut usize,
) -> io::Result<()> {
    match writer.write_all(bytes) {
        Ok(()) => {
            *bytes_written = (*bytes_written).saturating_add(bytes.len());
            Ok(())
        }
        Err(error) => Err(error),
    }
}
2842
/// `Read` adapter that keeps a running tally of the bytes that have passed
/// through it, starting from a caller-supplied offset.
struct CountingReader<R> {
    inner: R,
    bytes_read: usize,
}

impl<R> CountingReader<R> {
    /// Wrap `inner`, seeding the tally with `bytes_read`.
    fn with_bytes_read(inner: R, bytes_read: usize) -> Self {
        CountingReader { inner, bytes_read }
    }

    /// The seed plus every byte returned by `read` so far.
    fn bytes_read(&self) -> usize {
        self.bytes_read
    }
}

impl<R: Read> Read for CountingReader<R> {
    /// Forward to the inner reader and add the number of bytes it returned to
    /// the tally (saturating). Errors propagate without changing the tally.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        match self.inner.read(buf) {
            Ok(count) => {
                self.bytes_read = self.bytes_read.saturating_add(count);
                Ok(count)
            }
            Err(error) => Err(error),
        }
    }
}
2865
2866fn read_exact_stream<R: Read>(
2867    reader: &mut CountingReader<R>,
2868    buf: &mut [u8],
2869    eof_message: &'static str,
2870) -> Result<(), String> {
2871    reader.read_exact(buf).map_err(|error| {
2872        if error.kind() == io::ErrorKind::UnexpectedEof {
2873            eof_message.to_string()
2874        } else {
2875            format!("{eof_message}: {error}")
2876        }
2877    })
2878}
2879
2880fn read_u8_stream<R: Read>(
2881    reader: &mut CountingReader<R>,
2882    eof_message: &'static str,
2883) -> Result<u8, String> {
2884    let mut bytes = [0u8; 1];
2885    read_exact_stream(reader, &mut bytes, eof_message)?;
2886    Ok(bytes[0])
2887}
2888
2889fn read_u32_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u32, String> {
2890    let mut bytes = [0u8; 4];
2891    read_exact_stream(reader, &mut bytes, "unexpected end of data reading u32")?;
2892    Ok(u32::from_le_bytes(bytes))
2893}
2894
2895fn read_u64_stream<R: Read>(reader: &mut CountingReader<R>) -> Result<u64, String> {
2896    let mut bytes = [0u8; 8];
2897    read_exact_stream(reader, &mut bytes, "unexpected end of data reading u64")?;
2898    Ok(u64::from_le_bytes(bytes))
2899}
2900
2901fn read_string_stream<R: Read>(
2902    reader: &mut CountingReader<R>,
2903    total_len: Option<usize>,
2904) -> Result<String, String> {
2905    let len = read_u32_stream(reader)? as usize;
2906    if total_len.is_some_and(|total_len| reader.bytes_read().saturating_add(len) > total_len) {
2907        return Err("unexpected end of data reading string".to_string());
2908    }
2909    let mut bytes = vec![0u8; len];
2910    read_exact_stream(reader, &mut bytes, "unexpected end of data reading string")?;
2911    Ok(String::from_utf8_lossy(&bytes).to_string())
2912}
2913
2914/// Build enriched embedding text from a symbol with cAST-style context
2915fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2916    let relative = file
2917        .strip_prefix(project_root)
2918        .unwrap_or(file)
2919        .to_string_lossy();
2920
2921    let kind_label = match &symbol.kind {
2922        SymbolKind::Function => "function",
2923        SymbolKind::Class => "class",
2924        SymbolKind::Method => "method",
2925        SymbolKind::Struct => "struct",
2926        SymbolKind::Interface => "interface",
2927        SymbolKind::Enum => "enum",
2928        SymbolKind::TypeAlias => "type",
2929        SymbolKind::Variable => "variable",
2930        SymbolKind::Heading => "heading",
2931        SymbolKind::FileSummary => "file-summary",
2932    };
2933
2934    // Build: "file:relative/path kind:function name:validateAuth signature:fn validateAuth(token: &str) -> bool"
2935    let name = &symbol.name;
2936    let mut text = format!(
2937        "name:{name} file:{} kind:{} name:{name}",
2938        relative, kind_label
2939    );
2940
2941    if let Some(sig) = &symbol.signature {
2942        // Cap the signature: structured parsers (e.g. YAML/Kubernetes) pack
2943        // entire inline scripts (CronJob/Job `command:` bodies, multi-KB) into
2944        // the signature. Appending it unbounded produces a single embed_text
2945        // that overflows the embedding backend's physical batch (e.g. a
2946        // llama.cpp server's 512-token cap), aborting the whole index build
2947        // and silently degrading every search to lexical. 400 chars keeps the
2948        // identifying head of the signature without blowing the budget.
2949        text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
2950    }
2951
2952    // Add body snippet (first ~300 chars of symbol body)
2953    let lines: Vec<&str> = source.lines().collect();
2954    let start = (symbol.range.start_line as usize).min(lines.len());
2955    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2956    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2957    if start < end {
2958        let body: String = lines[start..end]
2959            .iter()
2960            .take(15) // max 15 lines
2961            .copied()
2962            .collect::<Vec<&str>>()
2963            .join("\n");
2964        let snippet = if body.len() > 300 {
2965            format!("{}...", &body[..body.floor_char_boundary(300)])
2966        } else {
2967            body
2968        };
2969        text.push_str(&format!(" body:{}", snippet));
2970    }
2971
2972    // Final defense-in-depth clamp: no single embed_text may exceed the
2973    // backend's per-input budget regardless of which field grew. Most
2974    // backends cap a physical batch around 512 tokens; ~1600 chars stays
2975    // comfortably under that for typical English/code (≈4 chars/token).
2976    truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
2977}
2978
/// Upper bound on characters in a single chunk's `embed_text`. Keeps any one
/// input below typical embedding-backend physical batch limits (~512 tokens)
/// so an oversized symbol cannot abort the whole index build.
///
/// At roughly 4 characters per token for typical English/code, 1600
/// characters is about 400 tokens. The limit is applied with
/// `truncate_chars`, so it counts `char`s rather than bytes.
const MAX_EMBED_TEXT_CHARS: usize = 1600;
2983
/// Return at most the first `max_chars` characters of `value` as an owned
/// string. Counts `char`s, so the cut never lands inside a UTF-8 sequence.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // Byte offset of the first character that must be dropped, if any.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_string(),
        None => value.to_string(),
    }
}
2987
2988fn first_leading_doc_comment(source: &str) -> String {
2989    let lines: Vec<&str> = source.lines().collect();
2990    let Some((start, first)) = lines
2991        .iter()
2992        .enumerate()
2993        .find(|(_, line)| !line.trim().is_empty())
2994    else {
2995        return String::new();
2996    };
2997
2998    let trimmed = first.trim_start();
2999    if trimmed.starts_with("/**") {
3000        let mut comment = Vec::new();
3001        for line in lines.iter().skip(start) {
3002            comment.push(*line);
3003            if line.contains("*/") {
3004                break;
3005            }
3006        }
3007        return truncate_chars(&comment.join("\n"), 200);
3008    }
3009
3010    if trimmed.starts_with("///") || trimmed.starts_with("//!") {
3011        let comment = lines
3012            .iter()
3013            .skip(start)
3014            .take_while(|line| {
3015                let trimmed = line.trim_start();
3016                trimmed.starts_with("///") || trimmed.starts_with("//!")
3017            })
3018            .copied()
3019            .collect::<Vec<_>>()
3020            .join("\n");
3021        return truncate_chars(&comment, 200);
3022    }
3023
3024    String::new()
3025}
3026
3027pub fn build_file_summary_chunk(
3028    file: &Path,
3029    project_root: &Path,
3030    source: &str,
3031    top_exports: &[&str],
3032    top_export_signatures: &[Option<&str>],
3033) -> SemanticChunk {
3034    let relative = file.strip_prefix(project_root).unwrap_or(file);
3035    let rel_path = relative.to_string_lossy();
3036    let parent_dir = relative
3037        .parent()
3038        .map(|parent| parent.to_string_lossy().to_string())
3039        .unwrap_or_default();
3040    let name = file
3041        .file_stem()
3042        .map(|stem| stem.to_string_lossy().to_string())
3043        .unwrap_or_default();
3044    let doc = first_leading_doc_comment(source);
3045    let exports = top_exports
3046        .iter()
3047        .take(5)
3048        .copied()
3049        .collect::<Vec<_>>()
3050        .join(",");
3051    let snippet = if doc.is_empty() {
3052        top_export_signatures
3053            .first()
3054            .and_then(|signature| signature.as_deref())
3055            .map(|signature| truncate_chars(signature, 200))
3056            .unwrap_or_default()
3057    } else {
3058        doc.clone()
3059    };
3060
3061    SemanticChunk {
3062        file: file.to_path_buf(),
3063        name,
3064        kind: SymbolKind::FileSummary,
3065        start_line: 0,
3066        end_line: 0,
3067        exported: false,
3068        embed_text: truncate_chars(
3069            &format!(
3070                "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
3071                file.file_stem()
3072                    .map(|stem| stem.to_string_lossy().to_string())
3073                    .unwrap_or_default()
3074            ),
3075            MAX_EMBED_TEXT_CHARS,
3076        ),
3077        snippet,
3078    }
3079}
3080
3081fn parser_for(
3082    parsers: &mut HashMap<crate::parser::LangId, Parser>,
3083    lang: crate::parser::LangId,
3084) -> Result<&mut Parser, String> {
3085    use std::collections::hash_map::Entry;
3086
3087    match parsers.entry(lang) {
3088        Entry::Occupied(entry) => Ok(entry.into_mut()),
3089        Entry::Vacant(entry) => {
3090            let grammar = grammar_for(lang);
3091            let mut parser = Parser::new();
3092            parser
3093                .set_language(&grammar)
3094                .map_err(|error| error.to_string())?;
3095            Ok(entry.insert(parser))
3096        }
3097    }
3098}
3099
/// Whether `path` has one of the file extensions listed below.
///
/// The comparison is exact and case-sensitive. Paths without an extension, or
/// whose extension is not valid UTF-8, return `false`.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
    ];

    match path.extension().and_then(|extension| extension.to_str()) {
        Some(extension) => INDEXED_EXTENSIONS.contains(&extension),
        None => false,
    }
}
3132
3133fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
3134    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
3135    let mtime = metadata.modified().map_err(|error| error.to_string())?;
3136    let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
3137        .map_err(|error| error.to_string())?
3138        .unwrap_or_else(cache_freshness::zero_hash);
3139    Ok(IndexedFileMetadata {
3140        mtime,
3141        size: metadata.len(),
3142        content_hash,
3143    })
3144}
3145
/// Canonicalizes `path`, tolerating a final component that does not exist.
///
/// Resolution order:
/// 1. the canonical form of `path` itself, when that succeeds;
/// 2. otherwise the canonical form of its parent joined with its file name;
/// 3. otherwise (no parent, no file name, or the parent cannot be
///    canonicalized either) `path` unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| match (path.parent(), path.file_name()) {
        (Some(dir), Some(leaf)) => match fs::canonicalize(dir) {
            Ok(real_dir) => real_dir.join(leaf),
            Err(_) => path.to_path_buf(),
        },
        _ => path.to_path_buf(),
    })
}
3162
3163/// Files larger than this are skipped for semantic chunking. The read +
3164/// tree-sitter parse is transiently O(file size) (tree-sitter can use several×
3165/// the source bytes), and `par_iter` collection parses many files at once, so an
3166/// unbounded read here is an OOM vector on a repo with a few multi-MB generated/
3167/// vendored/minified files. A file this large yields almost no useful embedding
3168/// anyway (each chunk's embed_text is clamped to MAX_EMBED_TEXT_CHARS), so we
3169/// track it (0 chunks) instead of reading it — freshness then skips it on later
3170/// refreshes. 4 MiB keeps essentially all hand-written source while capping the
3171/// pathological tail.
3172const MAX_SEMANTIC_FILE_BYTES: u64 = 4 * 1024 * 1024;
3173
3174fn collect_file_chunks(
3175    project_root: &Path,
3176    file: &Path,
3177    parsers: &mut HashMap<crate::parser::LangId, Parser>,
3178) -> Result<Vec<SemanticChunk>, String> {
3179    if !is_semantic_indexed_extension(file) {
3180        return Err("unsupported file extension".to_string());
3181    }
3182    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
3183    // OOM backstop: skip oversized files before the read + parse (tracked with
3184    // zero chunks by the caller, so freshness won't re-read them every refresh).
3185    if std::fs::metadata(file).is_ok_and(|m| m.len() > MAX_SEMANTIC_FILE_BYTES) {
3186        return Ok(Vec::new());
3187    }
3188    let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
3189    let tree = parser_for(parsers, lang)?
3190        .parse(&source, None)
3191        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
3192    let symbols =
3193        extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
3194
3195    Ok(symbols_to_chunks(file, &symbols, &source, project_root))
3196}
3197
3198/// Build a display snippet from a symbol's source
3199fn build_snippet(symbol: &Symbol, source: &str) -> String {
3200    let lines: Vec<&str> = source.lines().collect();
3201    let start = (symbol.range.start_line as usize).min(lines.len());
3202    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
3203    let end = (symbol.range.end_line as usize + 1).min(lines.len());
3204    if start < end {
3205        let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
3206        let mut snippet = snippet_lines.join("\n");
3207        if end - start > 5 {
3208            snippet.push_str("\n  ...");
3209        }
3210        if snippet.len() > 300 {
3211            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
3212        }
3213        snippet
3214    } else {
3215        String::new()
3216    }
3217}
3218
3219/// Convert symbols to semantic chunks with enriched context
3220fn symbols_to_chunks(
3221    file: &Path,
3222    symbols: &[Symbol],
3223    source: &str,
3224    project_root: &Path,
3225) -> Vec<SemanticChunk> {
3226    let mut chunks = Vec::new();
3227    let top_exports_with_signatures = symbols
3228        .iter()
3229        .filter(|symbol| {
3230            symbol.exported
3231                && symbol.parent.is_none()
3232                && !matches!(symbol.kind, SymbolKind::Heading)
3233        })
3234        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
3235        .collect::<Vec<_>>();
3236
3237    let has_only_headings = !symbols.is_empty()
3238        && symbols
3239            .iter()
3240            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
3241    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
3242        let top_exports = top_exports_with_signatures
3243            .iter()
3244            .map(|(name, _)| *name)
3245            .collect::<Vec<_>>();
3246        let top_export_signatures = top_exports_with_signatures
3247            .iter()
3248            .map(|(_, signature)| *signature)
3249            .collect::<Vec<_>>();
3250        chunks.push(build_file_summary_chunk(
3251            file,
3252            project_root,
3253            source,
3254            &top_exports,
3255            &top_export_signatures,
3256        ));
3257    }
3258
3259    for symbol in symbols {
3260        // Skip Markdown / HTML heading chunks: empirically they dominate result
3261        // lists even for code-shaped queries because heading prose embeds well.
3262        // Agents querying for code lose the actual matches under doc noise.
3263        // README/docs queries are still served by grep on the same files.
3264        if matches!(symbol.kind, SymbolKind::Heading) {
3265            continue;
3266        }
3267
3268        // Skip very small symbols (single-line variables, etc.)
3269        let line_count = symbol
3270            .range
3271            .end_line
3272            .saturating_sub(symbol.range.start_line)
3273            + 1;
3274        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
3275            continue;
3276        }
3277
3278        let embed_text = build_embed_text(symbol, source, file, project_root);
3279        let snippet = build_snippet(symbol, source);
3280
3281        chunks.push(SemanticChunk {
3282            file: file.to_path_buf(),
3283            name: symbol.name.clone(),
3284            kind: symbol.kind.clone(),
3285            start_line: symbol.range.start_line,
3286            end_line: symbol.range.end_line,
3287            exported: symbol.exported,
3288            embed_text,
3289            snippet,
3290        });
3291
3292        // Note: Nested symbols are handled separately by the outline system
3293        // Each symbol is indexed individually
3294    }
3295
3296    chunks
3297}
3298
/// Orders `(score, index)` pairs for ranking: higher scores first, ties broken
/// by ascending index.
///
/// NaN scores are ranked after every real score (and among themselves by
/// index). This keeps the comparison a total order: treating NaN as "equal to
/// everything" is not transitive, which leaves sort results unspecified and
/// can make the standard sort routines panic.
fn semantic_score_order(a: &(f32, usize), b: &(f32, usize)) -> std::cmp::Ordering {
    let by_score = match (a.0.is_nan(), b.0.is_nan()) {
        (true, true) => std::cmp::Ordering::Equal,
        (true, false) => std::cmp::Ordering::Greater,
        (false, true) => std::cmp::Ordering::Less,
        // Neither side is NaN, so `partial_cmp` always yields `Some`; the
        // fallback only exists to avoid a panic path.
        (false, false) => b
            .0
            .partial_cmp(&a.0)
            .unwrap_or(std::cmp::Ordering::Equal),
    };
    by_score.then_with(|| a.1.cmp(&b.1))
}
3304
/// Computes the cosine similarity of `a` and `b`.
///
/// Returns `0.0` when the slices differ in length or when the product of
/// their norms is zero (which includes two empty slices).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Single pass accumulating the dot product and both squared norms.
    let (dot, sq_norm_a, sq_norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, sq_norm_a, sq_norm_b), (&x, &y)| {
            (dot + x * y, sq_norm_a + x * x, sq_norm_b + y * y)
        },
    );

    let denom = sq_norm_a.sqrt() * sq_norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
3328
3329// Serialization helpers
3330fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
3331    match kind {
3332        SymbolKind::Function => 0,
3333        SymbolKind::Class => 1,
3334        SymbolKind::Method => 2,
3335        SymbolKind::Struct => 3,
3336        SymbolKind::Interface => 4,
3337        SymbolKind::Enum => 5,
3338        SymbolKind::TypeAlias => 6,
3339        SymbolKind::Variable => 7,
3340        SymbolKind::Heading => 8,
3341        SymbolKind::FileSummary => 9,
3342    }
3343}
3344
3345fn u8_to_symbol_kind(v: u8) -> SymbolKind {
3346    match v {
3347        0 => SymbolKind::Function,
3348        1 => SymbolKind::Class,
3349        2 => SymbolKind::Method,
3350        3 => SymbolKind::Struct,
3351        4 => SymbolKind::Interface,
3352        5 => SymbolKind::Enum,
3353        6 => SymbolKind::TypeAlias,
3354        7 => SymbolKind::Variable,
3355        8 => SymbolKind::Heading,
3356        9 => SymbolKind::FileSummary,
3357        _ => SymbolKind::Heading,
3358    }
3359}
3360
3361#[cfg(test)]
3362mod tests {
3363    use super::*;
3364    use crate::config::{SemanticBackend, SemanticBackendConfig};
3365    use crate::parser::FileParser;
3366    use std::io::{Read, Write};
3367    use std::net::TcpListener;
3368    use std::thread;
3369
3370    #[test]
3371    fn semantic_index_includes_php_inc_and_scss_extensions() {
3372        for file in ["partial.inc", "index.php", "styles.scss"] {
3373            assert!(
3374                is_semantic_indexed_extension(Path::new(file)),
3375                "{file} should be semantic-index eligible"
3376            );
3377        }
3378    }
3379
3380    #[test]
3381    fn transient_marker_round_trips_and_classifies() {
3382        // A marked transient error is recognized and the marker is stripped for
3383        // display, leaving a clean message.
3384        let marked = format!("{TRANSIENT_EMBEDDING_MARKER}openai compatible request failed: error sending request for url (http://localhost:1234/v1/embeddings)");
3385        assert!(embedding_failure_is_transient(&marked));
3386        let clean = strip_transient_embedding_marker(&marked);
3387        assert!(!clean.contains(TRANSIENT_EMBEDDING_MARKER));
3388        assert!(clean.starts_with("openai compatible request failed:"));
3389
3390        // Permanent errors (HTTP 4xx, dimension mismatch) carry no marker and
3391        // are not classified transient — they must fail fast.
3392        for permanent in [
3393            "openai compatible request failed (HTTP 401): Unauthorized",
3394            "embedding dimension mismatch: index has 384, model returned 768",
3395            "too many files (>20000) for semantic indexing (max 20000)",
3396        ] {
3397            assert!(
3398                !embedding_failure_is_transient(permanent),
3399                "{permanent:?} must not be transient"
3400            );
3401            // Stripping a marker-free string is a no-op.
3402            assert_eq!(strip_transient_embedding_marker(permanent), permanent);
3403        }
3404    }
3405
3406    #[test]
3407    fn send_error_transience_separates_connect_timeout_from_4xx() {
3408        // 5xx / 429 are transient; other client errors are not.
3409        assert!(is_retryable_embedding_status(
3410            reqwest::StatusCode::INTERNAL_SERVER_ERROR
3411        ));
3412        assert!(is_retryable_embedding_status(
3413            reqwest::StatusCode::TOO_MANY_REQUESTS
3414        ));
3415        assert!(!is_retryable_embedding_status(
3416            reqwest::StatusCode::UNAUTHORIZED
3417        ));
3418        assert!(!is_retryable_embedding_status(
3419            reqwest::StatusCode::BAD_REQUEST
3420        ));
3421    }
3422
3423    #[test]
3424    fn local_backend_model_loading_body_is_transient() {
3425        // LM Studio / Ollama return a 4xx with a loading/unloaded message while
3426        // the model swaps; these must classify transient so the build self-heals.
3427        for body in [
3428            r#"{"error":"Model was unloaded while the request was still in queue.."}"#,
3429            r#"{"error":"model is loading, please wait"}"#,
3430            r#"{"error":"Model not loaded"}"#,
3431            "Loading model into memory",
3432        ] {
3433            assert!(
3434                embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3435                "{body:?} should be body-transient"
3436            );
3437        }
3438
3439        // A genuine 4xx misconfiguration body must NOT be treated as transient,
3440        // even when it happens to contain generic words from the old broad
3441        // substring matcher.
3442        for body in [
3443            r#"{"error":"invalid api key"}"#,
3444            r#"{"error":"model 'foo' not found"}"#,
3445            "Bad Request: unknown field",
3446            "Bad Request: invalid loading model option",
3447            r#"{"error":"unauthorized while model is being loaded by another account"}"#,
3448        ] {
3449            assert!(
3450                !embedding_response_body_is_transient(reqwest::StatusCode::BAD_REQUEST, body),
3451                "{body:?} must not be body-transient"
3452            );
3453        }
3454
3455        assert!(
3456            !embedding_response_body_is_transient(
3457                reqwest::StatusCode::UNAUTHORIZED,
3458                r#"{"error":"model is loading, please wait"}"#
3459            ),
3460            "permanent auth failures must not become transient because of body text"
3461        );
3462    }
3463
3464    fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
3465    where
3466        F: Fn(String, String, String) -> String + Send + 'static,
3467    {
3468        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3469        let addr = listener.local_addr().expect("local addr");
3470        let handle = thread::spawn(move || {
3471            let (mut stream, _) = listener.accept().expect("accept request");
3472            let mut buf = Vec::new();
3473            let mut chunk = [0u8; 4096];
3474            let mut header_end = None;
3475            let mut content_length = 0usize;
3476            loop {
3477                let n = stream.read(&mut chunk).expect("read request");
3478                if n == 0 {
3479                    break;
3480                }
3481                buf.extend_from_slice(&chunk[..n]);
3482                if header_end.is_none() {
3483                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3484                        header_end = Some(pos + 4);
3485                        let headers = String::from_utf8_lossy(&buf[..pos + 4]);
3486                        for line in headers.lines() {
3487                            if let Some(value) = line.strip_prefix("Content-Length:") {
3488                                content_length = value.trim().parse::<usize>().unwrap_or(0);
3489                            }
3490                        }
3491                    }
3492                }
3493                if let Some(end) = header_end {
3494                    if buf.len() >= end + content_length {
3495                        break;
3496                    }
3497                }
3498            }
3499
3500            let end = header_end.expect("header terminator");
3501            let request = String::from_utf8_lossy(&buf[..end]).to_string();
3502            let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3503            let mut lines = request.lines();
3504            let request_line = lines.next().expect("request line").to_string();
3505            let path = request_line
3506                .split_whitespace()
3507                .nth(1)
3508                .expect("request path")
3509                .to_string();
3510            let response_body = handler(request_line, path, body);
3511            let response = format!(
3512                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3513                response_body.len(),
3514                response_body
3515            );
3516            stream
3517                .write_all(response.as_bytes())
3518                .expect("write response");
3519        });
3520
3521        (format!("http://{}", addr), handle)
3522    }
3523
3524    fn start_truncated_body_server(attempts: usize) -> (String, thread::JoinHandle<()>) {
3525        let listener = TcpListener::bind("127.0.0.1:0").expect("bind truncated test server");
3526        listener
3527            .set_nonblocking(true)
3528            .expect("nonblocking listener");
3529        let addr = listener.local_addr().expect("local addr");
3530        let handle = thread::spawn(move || {
3531            let deadline = std::time::Instant::now() + Duration::from_secs(2);
3532            let mut accepted = 0usize;
3533            while accepted < attempts && std::time::Instant::now() < deadline {
3534                match listener.accept() {
3535                    Ok((mut stream, _)) => {
3536                        accepted += 1;
3537                        let mut buf = [0u8; 4096];
3538                        // The client (under test) uses a 250ms timeout and drops
3539                        // the connection when the truncated body never completes.
3540                        // On Windows that disconnect surfaces as a hard socket
3541                        // error (WSAECONNRESET) on these read/write calls, where
3542                        // Unix returns a clean EOF. Tolerate both: the mock does
3543                        // not need the request bytes, and a write to an
3544                        // already-hung-up client is expected.
3545                        let _ = stream.read(&mut buf);
3546                        let response = "HTTP/1.1 200 OK
3547Content-Type: application/json
3548Content-Length: 128
3549Connection: close
3550
3551{";
3552                        let _ = stream.write_all(response.as_bytes());
3553                    }
3554                    Err(error) if error.kind() == std::io::ErrorKind::WouldBlock => {
3555                        thread::sleep(Duration::from_millis(10));
3556                    }
3557                    Err(error) => panic!("accept request: {error}"),
3558                }
3559            }
3560        });
3561
3562        (format!("http://{}", addr), handle)
3563    }
3564
3565    #[test]
3566    fn response_body_read_failures_are_marked_transient() {
3567        let (url, handle) = start_truncated_body_server(EMBEDDING_REQUEST_MAX_ATTEMPTS);
3568        let client = Client::builder()
3569            .timeout(Duration::from_millis(250))
3570            .build()
3571            .expect("client");
3572
3573        let error = send_embedding_request(|| client.post(&url).body("{}"), "test backend")
3574            .expect_err("truncated body should fail");
3575
3576        handle.join().unwrap();
3577        assert!(
3578            embedding_failure_is_transient(&error),
3579            "body read failures should be transient-marked: {error}"
3580        );
3581        assert!(error.contains("response read failed"));
3582    }
3583
3584    fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3585        Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
3586    }
3587
3588    fn write_rust_file(path: &Path, function_name: &str) {
3589        fs::write(
3590            path,
3591            format!("pub fn {function_name}() -> bool {{\n    true\n}}\n"),
3592        )
3593        .unwrap();
3594    }
3595
3596    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3597        let mut embed = test_vector_for_texts;
3598        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3599    }
3600
3601    fn test_project_root() -> PathBuf {
3602        std::env::current_dir().unwrap()
3603    }
3604
3605    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3606        index.file_mtimes.insert(file.to_path_buf(), mtime);
3607        index.file_sizes.insert(file.to_path_buf(), size);
3608        index
3609            .file_hashes
3610            .insert(file.to_path_buf(), cache_freshness::zero_hash());
3611    }
3612
3613    fn legacy_semantic_index_bytes(index: &SemanticIndex) -> Vec<u8> {
3614        let mut buf = Vec::new();
3615        let fingerprint_bytes = index.fingerprint.as_ref().and_then(|fingerprint| {
3616            let encoded = fingerprint.as_string();
3617            if encoded.is_empty() {
3618                None
3619            } else {
3620                Some(encoded.into_bytes())
3621            }
3622        });
3623        let file_mtimes: Vec<_> = index
3624            .file_mtimes
3625            .iter()
3626            .filter_map(|(path, mtime)| {
3627                cache_relative_path(&index.project_root, path)
3628                    .map(|relative| (relative, path, mtime))
3629            })
3630            .collect();
3631        let entries: Vec<_> = index
3632            .entries
3633            .iter()
3634            .filter_map(|entry| {
3635                cache_relative_path(&index.project_root, &entry.chunk.file)
3636                    .map(|relative| (relative, entry))
3637            })
3638            .collect();
3639
3640        buf.push(SEMANTIC_INDEX_VERSION_V6);
3641        buf.extend_from_slice(&(index.dimension as u32).to_le_bytes());
3642        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
3643        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
3644        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
3645        buf.extend_from_slice(fp_bytes_ref);
3646
3647        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
3648        for (relative, path, mtime) in &file_mtimes {
3649            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
3650            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
3651            buf.extend_from_slice(&path_bytes);
3652            let duration = mtime
3653                .duration_since(SystemTime::UNIX_EPOCH)
3654                .unwrap_or_default();
3655            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
3656            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
3657            let size = index.file_sizes.get(*path).copied().unwrap_or_default();
3658            buf.extend_from_slice(&size.to_le_bytes());
3659            let hash = index
3660                .file_hashes
3661                .get(*path)
3662                .copied()
3663                .unwrap_or_else(cache_freshness::zero_hash);
3664            buf.extend_from_slice(hash.as_bytes());
3665        }
3666
3667        for (relative, entry) in &entries {
3668            let c = &entry.chunk;
3669            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
3670            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
3671            buf.extend_from_slice(&file_bytes);
3672
3673            let name_bytes = c.name.as_bytes();
3674            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
3675            buf.extend_from_slice(name_bytes);
3676
3677            buf.push(symbol_kind_to_u8(&c.kind));
3678            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
3679            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
3680            buf.push(c.exported as u8);
3681
3682            let snippet_bytes = c.snippet.as_bytes();
3683            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
3684            buf.extend_from_slice(snippet_bytes);
3685
3686            let embed_bytes = c.embed_text.as_bytes();
3687            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
3688            buf.extend_from_slice(embed_bytes);
3689
3690            for &val in &entry.vector {
3691                buf.extend_from_slice(&val.to_le_bytes());
3692            }
3693        }
3694
3695        buf
3696    }
3697
3698    #[derive(Default)]
3699    struct RecordingEmbedder {
3700        calls: Vec<Vec<String>>,
3701    }
3702
3703    impl RecordingEmbedder {
3704        fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3705            let vectors = texts
3706                .iter()
3707                .map(|text| deterministic_test_vector(text))
3708                .collect();
3709            self.calls.push(texts);
3710            Ok(vectors)
3711        }
3712
3713        fn total_embedded_texts(&self) -> usize {
3714            self.calls.iter().map(Vec::len).sum()
3715        }
3716
3717        fn embedded_texts(&self) -> Vec<&str> {
3718            self.calls
3719                .iter()
3720                .flat_map(|batch| batch.iter().map(String::as_str))
3721                .collect()
3722        }
3723    }
3724
3725    fn deterministic_test_vector(text: &str) -> Vec<f32> {
3726        let hash = blake3::hash(text.as_bytes());
3727        let bytes = hash.as_bytes();
3728        vec![
3729            1.0,
3730            bytes[0] as f32 / 255.0,
3731            bytes[1] as f32 / 255.0,
3732            bytes[2] as f32 / 255.0,
3733        ]
3734    }
3735
3736    fn build_recorded_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3737        let mut embedder = RecordingEmbedder::default();
3738        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3739        SemanticIndex::build(project_root, files, &mut embed, 16).unwrap()
3740    }
3741
3742    fn force_stale(index: &mut SemanticIndex, file: &Path) {
3743        set_file_metadata(index, file, SystemTime::UNIX_EPOCH, 0);
3744    }
3745
3746    fn write_source(path: &Path, source: &str) {
3747        if let Some(parent) = path.parent() {
3748            fs::create_dir_all(parent).unwrap();
3749        }
3750        fs::write(path, source).unwrap();
3751    }
3752
3753    fn entries_for_file<'a>(index: &'a SemanticIndex, file: &Path) -> Vec<&'a EmbeddingEntry> {
3754        index
3755            .entries
3756            .iter()
3757            .filter(|entry| entry.chunk.file == file)
3758            .collect()
3759    }
3760
3761    fn entry_by_name<'a>(index: &'a SemanticIndex, file: &Path, name: &str) -> &'a EmbeddingEntry {
3762        index
3763            .entries
3764            .iter()
3765            .find(|entry| entry.chunk.file == file && entry.chunk.name == name)
3766            .unwrap_or_else(|| panic!("missing semantic entry {name} in {}", file.display()))
3767    }
3768
3769    fn file_summary_entry<'a>(index: &'a SemanticIndex, file: &Path) -> &'a EmbeddingEntry {
3770        index
3771            .entries
3772            .iter()
3773            .find(|entry| entry.chunk.file == file && entry.chunk.kind == SymbolKind::FileSummary)
3774            .unwrap_or_else(|| panic!("missing file-summary entry in {}", file.display()))
3775    }
3776
3777    #[test]
3778    fn refresh_stale_line_shift_reuses_all_chunks_and_retains_entries() {
3779        let temp = tempfile::tempdir().unwrap();
3780        let project_root = temp.path();
3781        let file = project_root.join("src/lib.rs");
3782        let original = "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n";
3783        write_source(&file, original);
3784
3785        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3786        let original_entry_count = index.entries.len();
3787        let original_alpha_vector = entry_by_name(&index, &file, "alpha").vector.clone();
3788
3789        write_source(&file, &format!("\n{original}"));
3790        force_stale(&mut index, &file);
3791
3792        let mut embedder = RecordingEmbedder::default();
3793        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3794        let mut progress = |_done: usize, _total: usize| {};
3795        let summary = index
3796            .refresh_stale_files(
3797                project_root,
3798                std::slice::from_ref(&file),
3799                &mut embed,
3800                16,
3801                &mut progress,
3802            )
3803            .unwrap();
3804
3805        assert_eq!(summary.changed, 1);
3806        assert_eq!(embedder.total_embedded_texts(), 0);
3807        assert_eq!(index.entries.len(), original_entry_count);
3808        let shifted_alpha = entry_by_name(&index, &file, "alpha");
3809        assert_eq!(shifted_alpha.chunk.start_line, 1);
3810        assert_eq!(shifted_alpha.vector, original_alpha_vector);
3811    }
3812
3813    #[test]
3814    fn refresh_invalidated_line_shift_emits_full_replacement_delta_for_apply() {
3815        let temp = tempfile::tempdir().unwrap();
3816        let project_root = temp.path();
3817        let file = project_root.join("src/lib.rs");
3818        let original = "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n";
3819        write_source(&file, original);
3820
3821        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3822        let mut serving_index = worker_index.clone();
3823        let original_entry_count = worker_index.entries.len();
3824
3825        write_source(&file, &format!("\n{original}"));
3826
3827        let mut embedder = RecordingEmbedder::default();
3828        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3829        let mut progress = |_done: usize, _total: usize| {};
3830        let update = worker_index
3831            .refresh_invalidated_files(
3832                project_root,
3833                std::slice::from_ref(&file),
3834                &mut embed,
3835                16,
3836                100,
3837                &mut progress,
3838            )
3839            .unwrap();
3840
3841        assert_eq!(embedder.total_embedded_texts(), 0);
3842        assert_eq!(update.added_entries.len(), original_entry_count);
3843        assert_eq!(worker_index.entries.len(), original_entry_count);
3844
3845        serving_index.apply_refresh_update(
3846            update.added_entries,
3847            update.updated_metadata,
3848            &update.completed_paths,
3849        );
3850
3851        assert_eq!(serving_index.entries.len(), original_entry_count);
3852        assert_eq!(
3853            entries_for_file(&serving_index, &file).len(),
3854            original_entry_count
3855        );
3856        assert_eq!(
3857            entry_by_name(&serving_index, &file, "alpha")
3858                .chunk
3859                .start_line,
3860            1
3861        );
3862    }
3863
3864    #[test]
3865    fn refresh_invalidated_one_symbol_edit_embeds_only_changed_symbol() {
3866        let temp = tempfile::tempdir().unwrap();
3867        let project_root = temp.path();
3868        let file = project_root.join("src/lib.rs");
3869        write_source(
3870            &file,
3871            "pub fn alpha() -> i32 {\n    1\n}\n\npub fn beta() -> i32 {\n    2\n}\n",
3872        );
3873
3874        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3875        let original_entry_count = index.entries.len();
3876        let beta_vector = entry_by_name(&index, &file, "beta").vector.clone();
3877
3878        write_source(
3879            &file,
3880            "pub fn alpha() -> i32 {\n    10\n}\n\npub fn beta() -> i32 {\n    2\n}\n",
3881        );
3882
3883        let mut embedder = RecordingEmbedder::default();
3884        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3885        let mut progress = |_done: usize, _total: usize| {};
3886        let update = index
3887            .refresh_invalidated_files(
3888                project_root,
3889                std::slice::from_ref(&file),
3890                &mut embed,
3891                16,
3892                100,
3893                &mut progress,
3894            )
3895            .unwrap();
3896
3897        assert_eq!(embedder.total_embedded_texts(), 1);
3898        assert!(embedder.embedded_texts()[0].contains("name:alpha"));
3899        assert_eq!(update.added_entries.len(), original_entry_count);
3900        assert_eq!(entry_by_name(&index, &file, "beta").vector, beta_vector);
3901    }
3902
3903    #[test]
3904    fn refresh_reuses_one_old_vector_for_two_byte_identical_symbols() {
3905        let temp = tempfile::tempdir().unwrap();
3906        let project_root = temp.path();
3907        let file = project_root.join("src/dupe.js");
3908        let one_duplicate = "function duplicate() {\n  return 1;\n}\n";
3909        write_source(&file, one_duplicate);
3910
3911        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3912        let original_vector = entry_by_name(&index, &file, "duplicate").vector.clone();
3913
3914        write_source(&file, &format!("{one_duplicate}\n{one_duplicate}"));
3915
3916        let mut embedder = RecordingEmbedder::default();
3917        let mut embed = |texts: Vec<String>| embedder.embed(texts);
3918        let mut progress = |_done: usize, _total: usize| {};
3919        index
3920            .refresh_invalidated_files(
3921                project_root,
3922                std::slice::from_ref(&file),
3923                &mut embed,
3924                16,
3925                100,
3926                &mut progress,
3927            )
3928            .unwrap();
3929
3930        let duplicate_entries = index
3931            .entries
3932            .iter()
3933            .filter(|entry| entry.chunk.file == file && entry.chunk.name == "duplicate")
3934            .collect::<Vec<_>>();
3935        assert_eq!(duplicate_entries.len(), 2);
3936        assert_eq!(embedder.total_embedded_texts(), 0);
3937        assert_eq!(duplicate_entries[0].vector, original_vector);
3938        assert_eq!(duplicate_entries[1].vector, original_vector);
3939    }
3940
3941    #[test]
3942    fn file_summary_reuses_on_body_edit_and_misses_on_leading_doc_edit() {
3943        let temp = tempfile::tempdir().unwrap();
3944        let project_root = temp.path();
3945        let file = project_root.join("src/lib.rs");
3946        write_source(
3947            &file,
3948            "//! module docs v1\n\npub fn alpha() -> i32 {\n    1\n}\n",
3949        );
3950
3951        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
3952        let summary_before = file_summary_entry(&index, &file).vector.clone();
3953
3954        write_source(
3955            &file,
3956            "//! module docs v1\n\npub fn alpha() -> i32 {\n    2\n}\n",
3957        );
3958        let mut body_embedder = RecordingEmbedder::default();
3959        let mut body_embed = |texts: Vec<String>| body_embedder.embed(texts);
3960        let mut progress = |_done: usize, _total: usize| {};
3961        index
3962            .refresh_invalidated_files(
3963                project_root,
3964                std::slice::from_ref(&file),
3965                &mut body_embed,
3966                16,
3967                100,
3968                &mut progress,
3969            )
3970            .unwrap();
3971        assert_eq!(body_embedder.total_embedded_texts(), 1);
3972        assert!(body_embedder.embedded_texts()[0].contains("name:alpha"));
3973        assert_eq!(file_summary_entry(&index, &file).vector, summary_before);
3974
3975        write_source(
3976            &file,
3977            "//! module docs v2\n\npub fn alpha() -> i32 {\n    2\n}\n",
3978        );
3979        let mut doc_embedder = RecordingEmbedder::default();
3980        let mut doc_embed = |texts: Vec<String>| doc_embedder.embed(texts);
3981        index
3982            .refresh_invalidated_files(
3983                project_root,
3984                std::slice::from_ref(&file),
3985                &mut doc_embed,
3986                16,
3987                100,
3988                &mut progress,
3989            )
3990            .unwrap();
3991
3992        assert_eq!(doc_embedder.total_embedded_texts(), 1);
3993        assert!(doc_embedder.embedded_texts()[0].contains("kind:file-summary"));
3994        assert_ne!(file_summary_entry(&index, &file).vector, summary_before);
3995    }
3996
3997    #[test]
3998    fn refresh_invalidated_deleted_file_drops_entries_without_embedding() {
3999        let temp = tempfile::tempdir().unwrap();
4000        let project_root = temp.path();
4001        let file = project_root.join("src/lib.rs");
4002        write_source(&file, "pub fn alpha() -> i32 {\n    1\n}\n");
4003
4004        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4005        let mut serving_index = worker_index.clone();
4006        fs::remove_file(&file).unwrap();
4007
4008        let mut embedder = RecordingEmbedder::default();
4009        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4010        let mut progress = |_done: usize, _total: usize| {};
4011        let update = worker_index
4012            .refresh_invalidated_files(
4013                project_root,
4014                std::slice::from_ref(&file),
4015                &mut embed,
4016                16,
4017                100,
4018                &mut progress,
4019            )
4020            .unwrap();
4021
4022        assert_eq!(update.summary.deleted, 1);
4023        assert_eq!(embedder.total_embedded_texts(), 0);
4024        assert!(worker_index.entries.is_empty());
4025
4026        serving_index.apply_refresh_update(
4027            update.added_entries,
4028            update.updated_metadata,
4029            &update.completed_paths,
4030        );
4031        assert!(serving_index.entries.is_empty());
4032    }
4033
4034    #[test]
4035    fn watcher_collect_failure_does_not_resurrect_stale_entries() {
4036        let temp = tempfile::tempdir().unwrap();
4037        let project_root = temp.path();
4038        let file = project_root.join("src/lib.rs");
4039        write_source(&file, "pub fn alpha() -> i32 {\n    1\n}\n");
4040
4041        let mut worker_index = build_recorded_test_index(project_root, std::slice::from_ref(&file));
4042        let mut serving_index = worker_index.clone();
4043        fs::write(&file, [0xff, 0xfe, 0xfd]).unwrap();
4044
4045        let mut embedder = RecordingEmbedder::default();
4046        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4047        let mut progress = |_done: usize, _total: usize| {};
4048        let update = worker_index
4049            .refresh_invalidated_files(
4050                project_root,
4051                std::slice::from_ref(&file),
4052                &mut embed,
4053                16,
4054                100,
4055                &mut progress,
4056            )
4057            .unwrap();
4058
4059        assert_eq!(embedder.total_embedded_texts(), 0);
4060        assert!(update.added_entries.is_empty());
4061        assert!(worker_index.entries.is_empty());
4062        assert!(!worker_index.file_mtimes.contains_key(&file));
4063
4064        serving_index.apply_refresh_update(
4065            update.added_entries,
4066            update.updated_metadata,
4067            &update.completed_paths,
4068        );
4069        assert!(serving_index.entries.is_empty());
4070        assert!(!serving_index.file_mtimes.contains_key(&file));
4071    }
4072
4073    #[test]
4074    fn refresh_invalidated_cap_deferral_remains_file_count_based() {
4075        let temp = tempfile::tempdir().unwrap();
4076        let project_root = temp.path();
4077        let indexed = project_root.join("src/a.rs");
4078        let deferred = project_root.join("src/b.rs");
4079        write_source(&indexed, "pub fn alpha() -> i32 {\n    1\n}\n");
4080        write_source(&deferred, "pub fn beta() -> i32 {\n    2\n}\n");
4081
4082        let mut index = build_recorded_test_index(project_root, std::slice::from_ref(&indexed));
4083        let mut embedder = RecordingEmbedder::default();
4084        let mut embed = |texts: Vec<String>| embedder.embed(texts);
4085        let mut progress = |_done: usize, _total: usize| {};
4086        let update = index
4087            .refresh_invalidated_files(
4088                project_root,
4089                std::slice::from_ref(&deferred),
4090                &mut embed,
4091                16,
4092                1,
4093                &mut progress,
4094            )
4095            .unwrap();
4096
4097        assert_eq!(update.summary.total_processed, 1);
4098        assert_eq!(update.summary.added, 0);
4099        assert_eq!(embedder.total_embedded_texts(), 0);
4100        assert_eq!(index.indexed_file_count(), 1);
4101        assert!(index.deferred_files.contains(&deferred));
4102        assert!(entries_for_file(&index, &deferred).is_empty());
4103    }
4104
4105    #[test]
4106    fn semantic_cache_serialization_skips_paths_outside_project_root() {
4107        let dir = tempfile::tempdir().expect("create temp dir");
4108        let project = fs::canonicalize(dir.path()).expect("canonical project");
4109        let outside = project.join("..").join("outside.rs");
4110        let mut index = SemanticIndex::new(project.clone(), 3);
4111        index
4112            .file_mtimes
4113            .insert(outside.clone(), SystemTime::UNIX_EPOCH);
4114        index.file_sizes.insert(outside.clone(), 1);
4115        index
4116            .file_hashes
4117            .insert(outside.clone(), cache_freshness::zero_hash());
4118        index.entries.push(EmbeddingEntry {
4119            chunk: SemanticChunk {
4120                file: outside,
4121                name: "outside".to_string(),
4122                kind: SymbolKind::Function,
4123                start_line: 0,
4124                end_line: 0,
4125                exported: false,
4126                embed_text: "outside".to_string(),
4127                snippet: "outside".to_string(),
4128            },
4129            vector: vec![1.0, 0.0, 0.0],
4130        });
4131
4132        let bytes = index.to_bytes();
4133        let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
4134        assert_eq!(loaded.entries.len(), 0);
4135        assert!(loaded.file_mtimes.is_empty());
4136    }
4137
4138    #[test]
4139    fn semantic_search_bounded_top_k_matches_reference_full_sort() {
4140        let project_root = test_project_root();
4141        let file = project_root.join("src/lib.rs");
4142        let mut index = SemanticIndex::new(project_root, 2);
4143        let entries = [
4144            ("alpha", vec![1.0, 0.0], false),
4145            ("beta", vec![0.0, 1.0], false),
4146            ("gamma", vec![1.0, 0.0], false),
4147            ("delta", vec![0.5, 0.5], true),
4148            ("epsilon", vec![-1.0, 0.0], false),
4149        ];
4150        for (line, (name, vector, exported)) in entries.into_iter().enumerate() {
4151            index.entries.push(EmbeddingEntry {
4152                chunk: SemanticChunk {
4153                    file: file.clone(),
4154                    name: name.to_string(),
4155                    kind: SymbolKind::Function,
4156                    start_line: line as u32 + 1,
4157                    end_line: line as u32 + 1,
4158                    exported,
4159                    embed_text: name.to_string(),
4160                    snippet: format!("fn {name}() {{}}"),
4161                },
4162                vector,
4163            });
4164        }
4165
4166        let query = vec![1.0, 0.0];
4167        let top_k = 4;
4168        let mut reference: Vec<(f32, usize)> = index
4169            .entries
4170            .iter()
4171            .enumerate()
4172            .map(|(idx, entry)| {
4173                let mut score = cosine_similarity(&query, &entry.vector);
4174                if entry.chunk.exported {
4175                    score *= 1.1;
4176                }
4177                (score, idx)
4178            })
4179            .collect();
4180        reference.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
4181        let expected: Vec<(String, f32)> = reference
4182            .into_iter()
4183            .take(top_k)
4184            .map(|(score, idx)| (index.entries[idx].chunk.name.clone(), score))
4185            .collect();
4186
4187        let actual: Vec<(String, f32)> = index
4188            .search(&query, top_k)
4189            .into_iter()
4190            .map(|result| (result.name, result.score))
4191            .collect();
4192
4193        assert_eq!(
4194            actual.iter().map(|(name, _)| name).collect::<Vec<_>>(),
4195            expected.iter().map(|(name, _)| name).collect::<Vec<_>>()
4196        );
4197        for ((_, actual_score), (_, expected_score)) in actual.iter().zip(expected.iter()) {
4198            assert!((actual_score - expected_score).abs() < 1e-6);
4199        }
4200        assert_eq!(actual[0].0, "alpha");
4201        assert_eq!(actual[1].0, "gamma", "equal scores keep insertion order");
4202        assert!(index.search(&query, 0).is_empty());
4203    }
4204
4205    #[test]
4206    fn test_cosine_similarity_identical() {
4207        let a = vec![1.0, 0.0, 0.0];
4208        let b = vec![1.0, 0.0, 0.0];
4209        assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
4210    }
4211
4212    #[test]
4213    fn test_cosine_similarity_orthogonal() {
4214        let a = vec![1.0, 0.0, 0.0];
4215        let b = vec![0.0, 1.0, 0.0];
4216        assert!(cosine_similarity(&a, &b).abs() < 0.001);
4217    }
4218
4219    #[test]
4220    fn test_cosine_similarity_opposite() {
4221        let a = vec![1.0, 0.0, 0.0];
4222        let b = vec![-1.0, 0.0, 0.0];
4223        assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
4224    }
4225
4226    #[test]
4227    fn test_serialization_roundtrip() {
4228        let project_root = test_project_root();
4229        let file = project_root.join("src/main.rs");
4230        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
4231        index.entries.push(EmbeddingEntry {
4232            chunk: SemanticChunk {
4233                file: file.clone(),
4234                name: "handle_request".to_string(),
4235                kind: SymbolKind::Function,
4236                start_line: 10,
4237                end_line: 25,
4238                exported: true,
4239                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
4240                snippet: "fn handle_request() {\n  // ...\n}".to_string(),
4241            },
4242            vector: vec![0.1, 0.2, 0.3, 0.4],
4243        });
4244        index.dimension = 4;
4245        index
4246            .file_mtimes
4247            .insert(file.clone(), SystemTime::UNIX_EPOCH);
4248        index.file_sizes.insert(file, 0);
4249        index.set_fingerprint(SemanticIndexFingerprint {
4250            backend: "fastembed".to_string(),
4251            model: "all-MiniLM-L6-v2".to_string(),
4252            base_url: FALLBACK_BACKEND.to_string(),
4253            dimension: 4,
4254            chunking_version: default_chunking_version(),
4255        });
4256
4257        let bytes = index.to_bytes();
4258        let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
4259
4260        assert_eq!(restored.entries.len(), 1);
4261        assert_eq!(restored.entries[0].chunk.name, "handle_request");
4262        assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
4263        assert_eq!(restored.dimension, 4);
4264        assert_eq!(restored.backend_label(), Some("fastembed"));
4265        assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
4266    }
4267
4268    #[test]
4269    fn semantic_cache_streaming_persistence_matches_legacy_bytes_and_round_trips() {
4270        let storage = tempfile::tempdir().expect("create storage dir");
4271        let project = storage.path().join("project");
4272        fs::create_dir_all(project.join("src")).expect("create project src");
4273        let file = project.join("src/lib.rs");
4274        fs::write(&file, "pub fn alpha() {}\npub fn beta() {}\n").expect("write source");
4275        let project_root = fs::canonicalize(&project).expect("canonical project");
4276        let file = fs::canonicalize(&file).expect("canonical file");
4277
4278        let mut index = SemanticIndex::new(project_root.clone(), 3);
4279        let mtime = SystemTime::UNIX_EPOCH + Duration::new(123, 456);
4280        index.file_mtimes.insert(file.clone(), mtime);
4281        index.file_sizes.insert(file.clone(), 42);
4282        index
4283            .file_hashes
4284            .insert(file.clone(), cache_freshness::zero_hash());
4285        index.entries.push(EmbeddingEntry {
4286            chunk: SemanticChunk {
4287                file: file.clone(),
4288                name: "alpha".to_string(),
4289                kind: SymbolKind::Function,
4290                start_line: 0,
4291                end_line: 0,
4292                exported: true,
4293                embed_text: "file:src/lib.rs kind:function name:alpha".to_string(),
4294                snippet: "pub fn alpha() {}".to_string(),
4295            },
4296            vector: vec![0.1, 0.2, 0.3],
4297        });
4298        index.entries.push(EmbeddingEntry {
4299            chunk: SemanticChunk {
4300                file: file.clone(),
4301                name: "beta".to_string(),
4302                kind: SymbolKind::Function,
4303                start_line: 1,
4304                end_line: 1,
4305                exported: true,
4306                embed_text: "file:src/lib.rs kind:function name:beta".to_string(),
4307                snippet: "pub fn beta() {}".to_string(),
4308            },
4309            vector: vec![0.4, 0.5, 0.6],
4310        });
4311        let fingerprint = SemanticIndexFingerprint {
4312            backend: "fastembed".to_string(),
4313            model: "all-MiniLM-L6-v2".to_string(),
4314            base_url: FALLBACK_BACKEND.to_string(),
4315            dimension: 3,
4316            chunking_version: default_chunking_version(),
4317        };
4318        index.set_fingerprint(fingerprint.clone());
4319
4320        let legacy_bytes = legacy_semantic_index_bytes(&index);
4321        assert_eq!(index.to_bytes(), legacy_bytes);
4322
4323        index.write_to_disk(storage.path(), "proj");
4324        let data_path = storage.path().join("semantic/proj/semantic.bin");
4325        assert_eq!(
4326            fs::read(&data_path).expect("read semantic.bin"),
4327            legacy_bytes
4328        );
4329
4330        let loaded = SemanticIndex::read_from_disk(
4331            storage.path(),
4332            "proj",
4333            &project_root,
4334            false,
4335            Some(&fingerprint.as_string()),
4336        )
4337        .expect("load semantic index");
4338        assert_eq!(loaded.entries.len(), index.entries.len());
4339        assert_eq!(loaded.dimension, index.dimension);
4340        assert_eq!(
4341            loaded.fingerprint().unwrap().as_string(),
4342            fingerprint.as_string()
4343        );
4344        assert_eq!(loaded.file_mtimes.get(&file), Some(&mtime));
4345        assert_eq!(loaded.file_sizes.get(&file), Some(&42));
4346        assert_eq!(
4347            loaded.file_hashes.get(&file),
4348            Some(&cache_freshness::zero_hash())
4349        );
4350        for (actual, expected) in loaded.entries.iter().zip(index.entries.iter()) {
4351            assert_eq!(actual.chunk.file, expected.chunk.file);
4352            assert_eq!(actual.chunk.name, expected.chunk.name);
4353            assert_eq!(actual.chunk.kind, expected.chunk.kind);
4354            assert_eq!(actual.chunk.start_line, expected.chunk.start_line);
4355            assert_eq!(actual.chunk.end_line, expected.chunk.end_line);
4356            assert_eq!(actual.chunk.exported, expected.chunk.exported);
4357            assert_eq!(actual.chunk.embed_text, expected.chunk.embed_text);
4358            assert_eq!(actual.chunk.snippet, expected.chunk.snippet);
4359            assert_eq!(actual.vector, expected.vector);
4360        }
4361        assert_eq!(loaded.to_bytes(), legacy_bytes);
4362    }
4363
4364    #[test]
4365    fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
4366        let cases = [
4367            (SymbolKind::Function, 0),
4368            (SymbolKind::Class, 1),
4369            (SymbolKind::Method, 2),
4370            (SymbolKind::Struct, 3),
4371            (SymbolKind::Interface, 4),
4372            (SymbolKind::Enum, 5),
4373            (SymbolKind::TypeAlias, 6),
4374            (SymbolKind::Variable, 7),
4375            (SymbolKind::Heading, 8),
4376            (SymbolKind::FileSummary, 9),
4377        ];
4378
4379        for (kind, encoded) in cases {
4380            assert_eq!(symbol_kind_to_u8(&kind), encoded);
4381            assert_eq!(u8_to_symbol_kind(encoded), kind);
4382        }
4383    }
4384
4385    #[test]
4386    fn test_search_top_k() {
4387        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4388        index.dimension = 3;
4389
4390        // Add entries with known vectors
4391        for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
4392            let mut vec = vec![0.0f32; 3];
4393            vec[i] = 1.0; // orthogonal vectors
4394            index.entries.push(EmbeddingEntry {
4395                chunk: SemanticChunk {
4396                    file: PathBuf::from("/src/lib.rs"),
4397                    name: name.to_string(),
4398                    kind: SymbolKind::Function,
4399                    start_line: (i * 10 + 1) as u32,
4400                    end_line: (i * 10 + 5) as u32,
4401                    exported: true,
4402                    embed_text: format!("kind:function name:{}", name),
4403                    snippet: format!("fn {}() {{}}", name),
4404                },
4405                vector: vec,
4406            });
4407        }
4408
4409        // Query aligned with "auth" (index 0)
4410        let query = vec![0.9, 0.1, 0.0];
4411        let results = index.search(&query, 2);
4412
4413        assert_eq!(results.len(), 2);
4414        assert_eq!(results[0].name, "auth"); // highest score
4415        assert!(results[0].score > results[1].score);
4416    }
4417
4418    #[test]
4419    fn test_empty_index_search() {
4420        let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4421        let results = index.search(&[0.1, 0.2, 0.3], 10);
4422        assert!(results.is_empty());
4423    }
4424
4425    #[test]
4426    fn single_line_symbol_builds_non_empty_snippet() {
4427        let symbol = Symbol {
4428            name: "answer".to_string(),
4429            kind: SymbolKind::Variable,
4430            range: crate::symbols::Range {
4431                start_line: 0,
4432                start_col: 0,
4433                end_line: 0,
4434                end_col: 24,
4435            },
4436            signature: Some("const answer = 42".to_string()),
4437            scope_chain: Vec::new(),
4438            exported: true,
4439            parent: None,
4440        };
4441        let source = "export const answer = 42;\n";
4442
4443        let snippet = build_snippet(&symbol, source);
4444
4445        assert_eq!(snippet, "export const answer = 42;");
4446    }
4447
4448    #[test]
4449    fn optimized_file_chunk_collection_matches_file_parser_path() {
4450        let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
4451        let file = project_root.join("src/semantic_index.rs");
4452        let source = std::fs::read_to_string(&file).unwrap();
4453
4454        let mut legacy_parser = FileParser::new();
4455        let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
4456        let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
4457
4458        let mut parsers = HashMap::new();
4459        let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
4460
4461        assert_eq!(
4462            chunk_fingerprint(&optimized_chunks),
4463            chunk_fingerprint(&legacy_chunks)
4464        );
4465    }
4466
4467    fn chunk_fingerprint(
4468        chunks: &[SemanticChunk],
4469    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
4470        chunks
4471            .iter()
4472            .map(|chunk| {
4473                (
4474                    chunk.name.clone(),
4475                    chunk.kind.clone(),
4476                    chunk.start_line,
4477                    chunk.end_line,
4478                    chunk.exported,
4479                    chunk.embed_text.clone(),
4480                    chunk.snippet.clone(),
4481                )
4482            })
4483            .collect()
4484    }
4485
4486    #[test]
4487    fn collect_file_chunks_skips_oversized_file() {
4488        let dir = tempfile::tempdir().unwrap();
4489        let big = dir.path().join("huge.ts");
4490        // Just over the cap: a valid TS file that would otherwise yield chunks.
4491        let filler = "export const x = 1;\n"
4492            .repeat(((MAX_SEMANTIC_FILE_BYTES as usize) / "export const x = 1;\n".len()) + 16);
4493        std::fs::write(&big, &filler).unwrap();
4494        assert!(big.metadata().unwrap().len() > MAX_SEMANTIC_FILE_BYTES);
4495
4496        let mut parsers = HashMap::new();
4497        // Oversized → tracked with zero chunks, NOT an error (so the caller keeps
4498        // the file in metadata and freshness skips re-reading it).
4499        let chunks = collect_file_chunks(dir.path(), &big, &mut parsers).unwrap();
4500        assert!(chunks.is_empty(), "oversized file must yield no chunks");
4501
4502        // A small file of the same language still produces chunks.
4503        let small = dir.path().join("small.ts");
4504        std::fs::write(&small, "export function foo() { return 1; }\n").unwrap();
4505        let small_chunks = collect_file_chunks(dir.path(), &small, &mut parsers).unwrap();
4506        assert!(!small_chunks.is_empty(), "small file should still chunk");
4507    }
4508
4509    #[test]
4510    fn rejects_oversized_dimension_during_deserialization() {
4511        let mut bytes = Vec::new();
4512        bytes.push(1u8);
4513        bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
4514        bytes.extend_from_slice(&0u32.to_le_bytes());
4515        bytes.extend_from_slice(&0u32.to_le_bytes());
4516
4517        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4518    }
4519
4520    #[test]
4521    fn rejects_oversized_entry_count_during_deserialization() {
4522        let mut bytes = Vec::new();
4523        bytes.push(1u8);
4524        bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
4525        bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
4526        bytes.extend_from_slice(&0u32.to_le_bytes());
4527
4528        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
4529    }
4530
4531    #[test]
4532    fn invalidate_file_removes_entries_and_mtime() {
4533        let target = PathBuf::from("/src/main.rs");
4534        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4535        index.entries.push(EmbeddingEntry {
4536            chunk: SemanticChunk {
4537                file: target.clone(),
4538                name: "main".to_string(),
4539                kind: SymbolKind::Function,
4540                start_line: 0,
4541                end_line: 1,
4542                exported: false,
4543                embed_text: "main".to_string(),
4544                snippet: "fn main() {}".to_string(),
4545            },
4546            vector: vec![1.0; DEFAULT_DIMENSION],
4547        });
4548        index
4549            .file_mtimes
4550            .insert(target.clone(), SystemTime::UNIX_EPOCH);
4551        index.file_sizes.insert(target.clone(), 0);
4552
4553        index.invalidate_file(&target);
4554
4555        assert!(index.entries.is_empty());
4556        assert!(!index.file_mtimes.contains_key(&target));
4557        assert!(!index.file_sizes.contains_key(&target));
4558    }
4559
4560    #[test]
4561    fn refresh_missing_changed_file_is_purged_after_collect() {
4562        let temp = tempfile::tempdir().unwrap();
4563        let project_root = temp.path();
4564        let file = project_root.join("src/lib.rs");
4565        fs::create_dir_all(file.parent().unwrap()).unwrap();
4566        write_rust_file(&file, "vanished_symbol");
4567
4568        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4569        let original_size = *index.file_sizes.get(&file).unwrap();
4570        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
4571        fs::remove_file(&file).unwrap();
4572
4573        let mut embed = test_vector_for_texts;
4574        let mut progress = |_done: usize, _total: usize| {};
4575        let summary = index
4576            .refresh_stale_files(
4577                project_root,
4578                std::slice::from_ref(&file),
4579                &mut embed,
4580                8,
4581                &mut progress,
4582            )
4583            .unwrap();
4584
4585        assert_eq!(summary.changed, 0);
4586        assert_eq!(summary.added, 0);
4587        assert_eq!(summary.deleted, 1);
4588        assert!(index.entries.is_empty());
4589        assert!(!index.file_mtimes.contains_key(&file));
4590        assert!(!index.file_sizes.contains_key(&file));
4591        assert!(!index.file_hashes.contains_key(&file));
4592    }
4593
4594    #[test]
4595    fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
4596        let temp = tempfile::tempdir().unwrap();
4597        let project_root = temp.path();
4598        let file = project_root.join("src/lib.rs");
4599        fs::create_dir_all(file.parent().unwrap()).unwrap();
4600        write_rust_file(&file, "kept_symbol");
4601
4602        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4603        let original_entry_count = index.entries.len();
4604        let original_mtime = *index.file_mtimes.get(&file).unwrap();
4605        let original_size = *index.file_sizes.get(&file).unwrap();
4606
4607        let stale_mtime = SystemTime::UNIX_EPOCH;
4608        set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
4609        fs::remove_file(&file).unwrap();
4610        fs::create_dir(&file).unwrap();
4611
4612        let mut embed = test_vector_for_texts;
4613        let mut progress = |_done: usize, _total: usize| {};
4614        let summary = index
4615            .refresh_stale_files(
4616                project_root,
4617                std::slice::from_ref(&file),
4618                &mut embed,
4619                8,
4620                &mut progress,
4621            )
4622            .unwrap();
4623
4624        assert_eq!(summary.changed, 0);
4625        assert_eq!(summary.added, 0);
4626        assert_eq!(summary.deleted, 0);
4627        assert_eq!(index.entries.len(), original_entry_count);
4628        assert!(index
4629            .entries
4630            .iter()
4631            .any(|entry| entry.chunk.name == "kept_symbol"));
4632        assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
4633        assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
4634        assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
4635    }
4636
4637    #[test]
4638    fn refresh_never_indexed_file_error_does_not_record_mtime() {
4639        let temp = tempfile::tempdir().unwrap();
4640        let project_root = temp.path();
4641        let missing = project_root.join("src/missing.rs");
4642        fs::create_dir_all(missing.parent().unwrap()).unwrap();
4643
4644        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
4645        let mut embed = test_vector_for_texts;
4646        let mut progress = |_done: usize, _total: usize| {};
4647        let summary = index
4648            .refresh_stale_files(
4649                project_root,
4650                std::slice::from_ref(&missing),
4651                &mut embed,
4652                8,
4653                &mut progress,
4654            )
4655            .unwrap();
4656
4657        assert_eq!(summary.added, 0);
4658        assert_eq!(summary.changed, 0);
4659        assert_eq!(summary.deleted, 0);
4660        assert!(!index.file_mtimes.contains_key(&missing));
4661        assert!(!index.file_sizes.contains_key(&missing));
4662        assert!(index.entries.is_empty());
4663    }
4664
4665    #[test]
4666    fn refresh_reports_added_for_new_files() {
4667        let temp = tempfile::tempdir().unwrap();
4668        let project_root = temp.path();
4669        let existing = project_root.join("src/lib.rs");
4670        let added = project_root.join("src/new.rs");
4671        fs::create_dir_all(existing.parent().unwrap()).unwrap();
4672        write_rust_file(&existing, "existing_symbol");
4673        write_rust_file(&added, "added_symbol");
4674
4675        let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
4676        let mut embed = test_vector_for_texts;
4677        let mut progress = |_done: usize, _total: usize| {};
4678        let summary = index
4679            .refresh_stale_files(
4680                project_root,
4681                &[existing.clone(), added.clone()],
4682                &mut embed,
4683                8,
4684                &mut progress,
4685            )
4686            .unwrap();
4687
4688        assert_eq!(summary.added, 1);
4689        assert_eq!(summary.changed, 0);
4690        assert_eq!(summary.deleted, 0);
4691        assert_eq!(summary.total_processed, 2);
4692        assert!(index.file_mtimes.contains_key(&added));
4693        assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
4694    }
4695
4696    #[test]
4697    fn refresh_reports_deleted_for_removed_files() {
4698        let temp = tempfile::tempdir().unwrap();
4699        let project_root = temp.path();
4700        let deleted = project_root.join("src/deleted.rs");
4701        fs::create_dir_all(deleted.parent().unwrap()).unwrap();
4702        write_rust_file(&deleted, "deleted_symbol");
4703
4704        let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
4705        fs::remove_file(&deleted).unwrap();
4706
4707        let mut embed = test_vector_for_texts;
4708        let mut progress = |_done: usize, _total: usize| {};
4709        let summary = index
4710            .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
4711            .unwrap();
4712
4713        assert_eq!(summary.deleted, 1);
4714        assert_eq!(summary.changed, 0);
4715        assert_eq!(summary.added, 0);
4716        assert_eq!(summary.total_processed, 1);
4717        assert!(!index.file_mtimes.contains_key(&deleted));
4718        assert!(index.entries.is_empty());
4719    }
4720
4721    #[test]
4722    fn refresh_reports_changed_for_modified_files() {
4723        let temp = tempfile::tempdir().unwrap();
4724        let project_root = temp.path();
4725        let file = project_root.join("src/lib.rs");
4726        fs::create_dir_all(file.parent().unwrap()).unwrap();
4727        write_rust_file(&file, "old_symbol");
4728
4729        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4730        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
4731        write_rust_file(&file, "new_symbol");
4732
4733        let mut embed = test_vector_for_texts;
4734        let mut progress = |_done: usize, _total: usize| {};
4735        let summary = index
4736            .refresh_stale_files(
4737                project_root,
4738                std::slice::from_ref(&file),
4739                &mut embed,
4740                8,
4741                &mut progress,
4742            )
4743            .unwrap();
4744
4745        assert_eq!(summary.changed, 1);
4746        assert_eq!(summary.added, 0);
4747        assert_eq!(summary.deleted, 0);
4748        assert_eq!(summary.total_processed, 1);
4749        assert!(index
4750            .entries
4751            .iter()
4752            .any(|entry| entry.chunk.name == "new_symbol"));
4753        assert!(!index
4754            .entries
4755            .iter()
4756            .any(|entry| entry.chunk.name == "old_symbol"));
4757    }
4758
4759    #[test]
4760    fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
4761        let temp = tempfile::tempdir().unwrap();
4762        let project_root = temp.path();
4763        let file = project_root.join("src/lib.rs");
4764        fs::create_dir_all(file.parent().unwrap()).unwrap();
4765        write_rust_file(&file, "clean_symbol");
4766
4767        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
4768        let original_entries = index.entries.len();
4769        let mut embed_called = false;
4770        let mut embed = |texts: Vec<String>| {
4771            embed_called = true;
4772            test_vector_for_texts(texts)
4773        };
4774        let mut progress = |_done: usize, _total: usize| {};
4775        let summary = index
4776            .refresh_stale_files(
4777                project_root,
4778                std::slice::from_ref(&file),
4779                &mut embed,
4780                8,
4781                &mut progress,
4782            )
4783            .unwrap();
4784
4785        assert!(summary.is_noop());
4786        assert_eq!(summary.total_processed, 1);
4787        assert!(!embed_called);
4788        assert_eq!(index.entries.len(), original_entries);
4789    }
4790
4791    #[test]
4792    fn detects_missing_onnx_runtime_from_dynamic_load_error() {
4793        let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
4794
4795        assert!(is_onnx_runtime_unavailable(message));
4796    }
4797
4798    #[test]
4799    fn formats_missing_onnx_runtime_with_install_hint() {
4800        let message = format_embedding_init_error(
4801            "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
4802        );
4803
4804        assert!(message.starts_with("ONNX Runtime not found. Install via:"));
4805        assert!(message.contains("Original error:"));
4806    }
4807
4808    #[test]
4809    fn openai_compatible_backend_embeds_with_mock_server() {
4810        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
4811            assert!(request_line.starts_with("POST "));
4812            assert_eq!(path, "/v1/embeddings");
4813            "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
4814        });
4815
4816        let config = SemanticBackendConfig {
4817            backend: SemanticBackend::OpenAiCompatible,
4818            model: "test-embedding".to_string(),
4819            base_url: Some(base_url),
4820            api_key_env: None,
4821            timeout_ms: 5_000,
4822            max_batch_size: 64,
4823            max_files: 20_000,
4824        };
4825
4826        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
4827        let vectors = model
4828            .embed(vec!["hello".to_string(), "world".to_string()])
4829            .unwrap();
4830
4831        assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
4832        handle.join().unwrap();
4833    }
4834
4835    /// Regression for issue #36: AFT was sending TWO Content-Type headers
4836    /// on the OpenAI embeddings request — once implicitly via `.json(&body)`
4837    /// and again explicitly via `.header("Content-Type", "application/json")`.
4838    /// reqwest's `.header()` calls `HeaderMap::append`, which produces two
4839    /// headers on the wire. OpenAI's /v1/embeddings endpoint rejects that
4840    /// with `HTTP 400 "you must provide a model parameter"` even though the
4841    /// body actually contains `model`. The fix is to drop the explicit
4842    /// `.header("Content-Type", ...)` call. This test pins that we send
4843    /// exactly one Content-Type header.
4844    #[test]
4845    fn openai_compatible_request_has_single_content_type_header() {
4846        use std::sync::{Arc, Mutex};
4847        let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
4848        let captured_for_thread = Arc::clone(&captured);
4849
4850        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
4851        let addr = listener.local_addr().expect("local addr");
4852        let handle = thread::spawn(move || {
4853            let (mut stream, _) = listener.accept().expect("accept");
4854            let mut buf = Vec::new();
4855            let mut chunk = [0u8; 4096];
4856            let mut header_end = None;
4857            let mut content_length = 0usize;
4858            loop {
4859                let n = stream.read(&mut chunk).expect("read");
4860                if n == 0 {
4861                    break;
4862                }
4863                buf.extend_from_slice(&chunk[..n]);
4864                if header_end.is_none() {
4865                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
4866                        header_end = Some(pos + 4);
4867                        for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
4868                            if let Some(value) = line.strip_prefix("Content-Length:") {
4869                                content_length = value.trim().parse::<usize>().unwrap_or(0);
4870                            }
4871                        }
4872                    }
4873                }
4874                if let Some(end) = header_end {
4875                    if buf.len() >= end + content_length {
4876                        break;
4877                    }
4878                }
4879            }
4880            *captured_for_thread.lock().unwrap() = buf;
4881            let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
4882            let response = format!(
4883                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
4884                body.len(),
4885                body
4886            );
4887            let _ = stream.write_all(response.as_bytes());
4888        });
4889
4890        let config = SemanticBackendConfig {
4891            backend: SemanticBackend::OpenAiCompatible,
4892            model: "text-embedding-3-small".to_string(),
4893            base_url: Some(format!("http://{}", addr)),
4894            api_key_env: None,
4895            timeout_ms: 5_000,
4896            max_batch_size: 64,
4897            max_files: 20_000,
4898        };
4899        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
4900        let _ = model.embed(vec!["probe".to_string()]).unwrap();
4901        handle.join().unwrap();
4902
4903        let bytes = captured.lock().unwrap().clone();
4904        let request = String::from_utf8_lossy(&bytes);
4905
4906        // Lowercase line counts because HTTP headers are case-insensitive
4907        // and reqwest may emit `content-type` in lowercase under HTTP/2.
4908        let content_type_lines = request
4909            .lines()
4910            .filter(|line| {
4911                let lower = line.to_ascii_lowercase();
4912                lower.starts_with("content-type:")
4913            })
4914            .count();
4915        assert_eq!(
4916            content_type_lines, 1,
4917            "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
4918        );
4919
4920        // The body must still include the model field — pin this so a future
4921        // change can't accidentally drop `model` while fixing duplicate headers.
4922        assert!(
4923            request.contains(r#""model":"text-embedding-3-small""#),
4924            "request body should contain model field; full request:\n{request}",
4925        );
4926    }
4927
4928    #[test]
4929    fn ollama_backend_embeds_with_mock_server() {
4930        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
4931            assert!(request_line.starts_with("POST "));
4932            assert_eq!(path, "/api/embed");
4933            "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
4934        });
4935
4936        let config = SemanticBackendConfig {
4937            backend: SemanticBackend::Ollama,
4938            model: "embeddinggemma".to_string(),
4939            base_url: Some(base_url),
4940            api_key_env: None,
4941            timeout_ms: 5_000,
4942            max_batch_size: 64,
4943            max_files: 20_000,
4944        };
4945
4946        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
4947        let vectors = model
4948            .embed(vec!["hello".to_string(), "world".to_string()])
4949            .unwrap();
4950
4951        assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
4952        handle.join().unwrap();
4953    }
4954
4955    #[test]
4956    fn read_from_disk_rejects_fingerprint_mismatch() {
4957        let storage = tempfile::tempdir().unwrap();
4958        let project_key = "proj";
4959
4960        let project_root = test_project_root();
4961        let file = project_root.join("src/main.rs");
4962        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
4963        index.entries.push(EmbeddingEntry {
4964            chunk: SemanticChunk {
4965                file: file.clone(),
4966                name: "handle_request".to_string(),
4967                kind: SymbolKind::Function,
4968                start_line: 10,
4969                end_line: 25,
4970                exported: true,
4971                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
4972                snippet: "fn handle_request() {}".to_string(),
4973            },
4974            vector: vec![0.1, 0.2, 0.3],
4975        });
4976        index.dimension = 3;
4977        index
4978            .file_mtimes
4979            .insert(file.clone(), SystemTime::UNIX_EPOCH);
4980        index.file_sizes.insert(file, 0);
4981        index.set_fingerprint(SemanticIndexFingerprint {
4982            backend: "openai_compatible".to_string(),
4983            model: "test-embedding".to_string(),
4984            base_url: "http://127.0.0.1:1234/v1".to_string(),
4985            dimension: 3,
4986            chunking_version: default_chunking_version(),
4987        });
4988        index.write_to_disk(storage.path(), project_key);
4989
4990        let matching = index.fingerprint().unwrap().as_string();
4991        assert!(SemanticIndex::read_from_disk(
4992            storage.path(),
4993            project_key,
4994            &project_root,
4995            false,
4996            Some(&matching),
4997        )
4998        .is_some());
4999
5000        let mismatched = SemanticIndexFingerprint {
5001            backend: "ollama".to_string(),
5002            model: "embeddinggemma".to_string(),
5003            base_url: "http://127.0.0.1:11434".to_string(),
5004            dimension: 3,
5005            chunking_version: default_chunking_version(),
5006        }
5007        .as_string();
5008        assert!(SemanticIndex::read_from_disk(
5009            storage.path(),
5010            project_key,
5011            &project_root,
5012            false,
5013            Some(&mismatched),
5014        )
5015        .is_none());
5016    }
5017
5018    #[test]
5019    fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
5020        let storage = tempfile::tempdir().unwrap();
5021        let project_key = "proj-v3";
5022        let dir = storage.path().join("semantic").join(project_key);
5023        fs::create_dir_all(&dir).unwrap();
5024
5025        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
5026        index.entries.push(EmbeddingEntry {
5027            chunk: SemanticChunk {
5028                file: PathBuf::from("/src/main.rs"),
5029                name: "handle_request".to_string(),
5030                kind: SymbolKind::Function,
5031                start_line: 0,
5032                end_line: 0,
5033                exported: true,
5034                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
5035                snippet: "fn handle_request() {}".to_string(),
5036            },
5037            vector: vec![0.1, 0.2, 0.3],
5038        });
5039        index.dimension = 3;
5040        index
5041            .file_mtimes
5042            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
5043        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
5044        let fingerprint = SemanticIndexFingerprint {
5045            backend: "fastembed".to_string(),
5046            model: "test".to_string(),
5047            base_url: FALLBACK_BACKEND.to_string(),
5048            dimension: 3,
5049            chunking_version: default_chunking_version(),
5050        };
5051        index.set_fingerprint(fingerprint.clone());
5052
5053        let mut bytes = index.to_bytes();
5054        bytes[0] = SEMANTIC_INDEX_VERSION_V3;
5055        fs::write(dir.join("semantic.bin"), bytes).unwrap();
5056
5057        assert!(SemanticIndex::read_from_disk(
5058            storage.path(),
5059            project_key,
5060            &test_project_root(),
5061            false,
5062            Some(&fingerprint.as_string())
5063        )
5064        .is_none());
5065        assert!(!dir.join("semantic.bin").exists());
5066    }
5067
5068    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
5069        crate::symbols::Symbol {
5070            name: name.to_string(),
5071            kind,
5072            range: crate::symbols::Range {
5073                start_line: start,
5074                start_col: 0,
5075                end_line: end,
5076                end_col: 0,
5077            },
5078            signature: None,
5079            scope_chain: Vec::new(),
5080            exported: false,
5081            parent: None,
5082        }
5083    }
5084
5085    /// Heading symbols (Markdown / HTML headings) must NOT be indexed —
5086    /// they overwhelmingly dominated semantic results even on code-shaped
5087    /// queries because heading prose embeds far more strongly than code
5088    /// chunks. Skipping headings keeps aft_search a code-finder.
5089    #[test]
5090    fn symbols_to_chunks_skips_heading_symbols() {
5091        let project_root = PathBuf::from("/proj");
5092        let file = project_root.join("README.md");
5093        let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
5094
5095        let symbols = vec![
5096            make_symbol(SymbolKind::Heading, "Title", 0, 2),
5097            make_symbol(SymbolKind::Heading, "Section", 4, 6),
5098        ];
5099
5100        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
5101        assert!(
5102            chunks.is_empty(),
5103            "Heading symbols must be filtered out before embedding; got {} chunk(s)",
5104            chunks.len()
5105        );
5106    }
5107
5108    /// A symbol with an enormous signature (e.g. a YAML/Kubernetes CronJob
5109    /// whose inline `command:` script is parsed into the signature) must not
5110    /// produce an embed_text that overflows the embedding backend's physical
5111    /// batch. Before the clamp, the unbounded `signature:` append created a
5112    /// multi-KB input that aborted the whole index build and degraded every
5113    /// search to lexical-only.
5114    #[test]
5115    fn build_embed_text_clamps_oversized_signature() {
5116        let project_root = PathBuf::from("/proj");
5117        let file = project_root.join("cronjob.yaml");
5118        let huge_sig = "kubectl ".repeat(2000); // ~16 KB
5119        let source = "apiVersion: batch/v1\nkind: CronJob\n";
5120
5121        let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
5122        symbol.signature = Some(huge_sig);
5123
5124        let text = build_embed_text(&symbol, source, &file, &project_root);
5125        assert!(
5126            text.chars().count() <= MAX_EMBED_TEXT_CHARS,
5127            "embed_text must be clamped to {} chars, got {}",
5128            MAX_EMBED_TEXT_CHARS,
5129            text.chars().count()
5130        );
5131    }
5132
5133    /// Code symbols (functions, classes, methods, structs, etc.) must still
5134    /// be indexed alongside the heading skip — otherwise we'd starve the
5135    /// index entirely.
5136    #[test]
5137    fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
5138        let project_root = PathBuf::from("/proj");
5139        let file = project_root.join("src/lib.rs");
5140        let source = "pub fn handle_request() -> bool {\n    true\n}\n";
5141
5142        let symbols = vec![
5143            // A heading mixed in (e.g. from a doc comment block elsewhere).
5144            make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
5145            make_symbol(SymbolKind::Function, "handle_request", 0, 2),
5146            make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
5147        ];
5148
5149        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
5150        assert_eq!(
5151            chunks.len(),
5152            3,
5153            "Expected file-summary + 2 code chunks (Function + Struct), got {}",
5154            chunks.len()
5155        );
5156        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
5157        assert!(chunks
5158            .iter()
5159            .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
5160        assert!(names.contains(&"handle_request"));
5161        assert!(names.contains(&"AuthService"));
5162        assert!(
5163            !names.contains(&"doc heading"),
5164            "Heading symbol leaked into chunks: {names:?}"
5165        );
5166    }
5167
5168    #[test]
5169    fn validate_ssrf_allows_loopback_hostnames() {
5170        // Loopback hostnames are explicitly allowed so self-hosted backends
5171        // (Ollama at http://localhost:11434) work at their default config.
5172        for host in &[
5173            "http://localhost",
5174            "http://localhost:8080",
5175            "http://localhost:11434", // Ollama default
5176            "http://localhost.localdomain",
5177            "http://foo.localhost",
5178        ] {
5179            assert!(
5180                validate_base_url_no_ssrf(host).is_ok(),
5181                "Expected {host} to be allowed (loopback), got: {:?}",
5182                validate_base_url_no_ssrf(host)
5183            );
5184        }
5185    }
5186
5187    #[test]
5188    fn validate_ssrf_allows_loopback_ips() {
5189        // 127.0.0.0/8 is loopback — by definition same-machine and not an
5190        // SSRF target. Allow it so Ollama at http://127.0.0.1:11434 works.
5191        for url in &[
5192            "http://127.0.0.1",
5193            "http://127.0.0.1:11434", // Ollama default
5194            "http://127.0.0.1:8080",
5195            "http://127.1.2.3",
5196        ] {
5197            let result = validate_base_url_no_ssrf(url);
5198            assert!(
5199                result.is_ok(),
5200                "Expected {url} to be allowed (loopback), got: {:?}",
5201                result
5202            );
5203        }
5204    }
5205
5206    #[test]
5207    fn validate_ssrf_rejects_private_non_loopback_ips() {
5208        // Non-loopback private/reserved IPs remain rejected — homelab/intranet
5209        // services on LAN IPs are real SSRF targets even though the user
5210        // configured them. Users who want this can opt in by binding the
5211        // service to a public-routable address.
5212        for url in &[
5213            "http://192.168.1.1",
5214            "http://10.0.0.1",
5215            "http://172.16.0.1",
5216            "http://169.254.169.254",
5217            "http://100.64.0.1",
5218        ] {
5219            let result = validate_base_url_no_ssrf(url);
5220            assert!(
5221                result.is_err(),
5222                "Expected {url} to be rejected (non-loopback private), got: {:?}",
5223                result
5224            );
5225        }
5226    }
5227
5228    #[test]
5229    fn validate_ssrf_rejects_mdns_local_hostnames() {
5230        // mDNS .local hostnames typically resolve to LAN devices, not
5231        // loopback. Rejecting them before DNS lookup gives a clearer error.
5232        for host in &[
5233            "http://printer.local",
5234            "http://nas.local:8080",
5235            "http://homelab.local",
5236        ] {
5237            let result = validate_base_url_no_ssrf(host);
5238            assert!(
5239                result.is_err(),
5240                "Expected {host} to be rejected (mDNS), got: {:?}",
5241                result
5242            );
5243        }
5244    }
5245
5246    #[test]
5247    fn normalize_base_url_allows_localhost_for_tests() {
5248        // normalize_base_url itself should NOT block localhost — only
5249        // validate_base_url_no_ssrf does. Tests construct backends directly.
5250        assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
5251        assert!(normalize_base_url("http://localhost:8080").is_ok());
5252    }
5253
5254    #[test]
5255    fn ssrf_guard_blocks_reserved_ranges_but_allows_loopback() {
5256        use std::net::IpAddr;
5257        let blocked = |s: &str| is_private_non_loopback_ip(&s.parse::<IpAddr>().unwrap());
5258
5259        // Private / link-local / CGNAT — blocked (unchanged behavior).
5260        assert!(blocked("10.0.0.1"));
5261        assert!(blocked("192.168.1.1"));
5262        assert!(blocked("169.254.0.1"));
5263        assert!(blocked("100.64.0.1"));
5264        // Newly covered by delegating to url_fetch's complete list:
5265        assert!(
5266            blocked("198.18.0.1"),
5267            "RFC2544 benchmark range must be blocked"
5268        );
5269        assert!(blocked("224.0.0.1"), "multicast must be blocked");
5270        assert!(blocked("fc00::1"), "IPv6 ULA must be blocked");
5271        assert!(blocked("fe80::1"), "IPv6 link-local must be blocked");
5272
5273        // Loopback — allowed (local Ollama endpoint), incl. IPv4-mapped form.
5274        assert!(!blocked("127.0.0.1"), "loopback must stay allowed");
5275        assert!(!blocked("::1"), "IPv6 loopback must stay allowed");
5276        assert!(
5277            !blocked("::ffff:127.0.0.1"),
5278            "IPv4-mapped loopback must stay allowed (matches prior carve-out)"
5279        );
5280
5281        // A public address must NOT be flagged.
5282        assert!(!blocked("8.8.8.8"));
5283    }
5284
5285    /// Pin the user-facing wording of the ONNX version-mismatch error.
5286    /// The auto-fix path MUST be listed first because it's the only safe
5287    /// option that doesn't require sudo or risk breaking other apps that
5288    /// link the system library. Regression of any of these strings would
5289    /// either mislead users (system rm before auto-fix) or break the
5290    /// `aft doctor --fix` discovery path.
5291    #[test]
5292    fn ort_mismatch_message_recommends_auto_fix_first() {
5293        let msg =
5294            format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
5295
5296        // The reported version and path must appear verbatim.
5297        assert!(
5298            msg.contains("v1.9.0"),
5299            "should report detected version: {msg}"
5300        );
5301        assert!(
5302            msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
5303            "should report system path: {msg}"
5304        );
5305        assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
5306
5307        // Solution ordering: auto-fix is #1, system rm is #2, install is #3.
5308        let auto_fix_pos = msg
5309            .find("Auto-fix")
5310            .expect("Auto-fix solution missing — users won't discover --fix");
5311        let remove_pos = msg
5312            .find("Remove the old library")
5313            .expect("system-rm solution missing");
5314        assert!(
5315            auto_fix_pos < remove_pos,
5316            "Auto-fix must come before manual rm — see PR comment thread"
5317        );
5318
5319        // The auto-fix command must be runnable as-is on a fresh system.
5320        assert!(
5321            msg.contains("npx @cortexkit/aft doctor --fix"),
5322            "auto-fix command must be present and copy-pasteable: {msg}"
5323        );
5324    }
5325
5326    #[cfg(any(target_os = "linux", target_os = "macos"))]
5327    #[test]
5328    fn loaded_ort_version_detection_prefers_actual_loaded_library_path() {
5329        let requested = "libonnxruntime.so";
5330        let actual = "/usr/local/lib/libonnxruntime.so.1.19.0";
5331
5332        assert_eq!(detect_ort_version_from_path(requested), None);
5333        let (version, source) =
5334            detect_ort_version_from_resolved_or_requested(Some(actual.to_string()), requested);
5335
5336        assert_eq!(version, Some("1.19.0".to_string()));
5337        assert_eq!(source, actual);
5338
5339        let msg = format_ort_version_mismatch(&version.unwrap(), &source);
5340        assert!(msg.contains("v1.19.0"));
5341        assert!(msg.contains(actual));
5342    }
5343
5344    /// macOS dylib paths must not produce a malformed message when the
5345    /// system path lacks a trailing slash. This is a regression guard
5346    /// for the "{}\n{}" format string contract.
5347    #[test]
5348    fn ort_mismatch_message_handles_macos_dylib_path() {
5349        let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
5350        assert!(msg.contains("v1.9.0"));
5351        assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
5352        // The dylib path must appear in the auto-fix paragraph (single
5353        // quotes around it) AND in the manual-rm paragraph; verify
5354        // both placements survived the format string.
5355        assert!(
5356            msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
5357            "system path should be quoted in the auto-fix sentence: {msg}"
5358        );
5359    }
5360}