aft/
semantic_index.rs

1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use fastembed::{EmbeddingModel as FastembedEmbeddingModel, InitOptions, TextEmbedding};
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
// NOTE(review): presumably the embedding width assumed when none is known yet;
// not referenced in the visible part of this file — confirm against usages.
const DEFAULT_DIMENSION: usize = 384;
// NOTE(review): looks like an upper bound on persisted index entries — confirm.
const MAX_ENTRIES: usize = 1_000_000;
// Covers high-dimensional backends such as OpenAI text-embedding-3-large (3072)
// and common local models (4096) while keeping a bounded supported shape.
const MAX_DIMENSION: usize = 4096;
/// Size in bytes of one `f32`.
const F32_BYTES: usize = std::mem::size_of::<f32>();
// NOTE(review): header sizes for the V1/V2 on-disk formats, presumably in
// bytes — confirm against the reader/writer code.
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
/// User-facing hint for a missing ONNX Runtime installation.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
/// V3 adds subsec_nanos to the file-mtime table so staleness detection survives
/// restart round-trips on filesystems with subsecond mtime precision (APFS,
/// ext4 with nsec, NTFS). V1/V2 persisted whole-second mtimes only, which
/// caused every restart to flag ~99% of files as stale and re-embed them.
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
/// V4 keeps the V3 on-disk layout but rebuilds persisted snippets once after
/// fixing symbol ranges that were incorrectly treated as 1-based.
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
/// V5 adds file sizes to the file metadata table so incremental staleness
/// detection can catch content changes even when mtime precision misses them.
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
/// V6 stores paths relative to project_root and adds content hashes.
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended (after a `/v1` segment if needed) to an OpenAI-compatible base URL.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended to an Ollama base URL that does not already end in `/api`.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
// Must stay below the bridge timeout (30s) to avoid bridge kills on slow backends.
// Despite the name, this is the HTTP client timeout used for every backend
// whenever the configured `timeout_ms` is 0.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the configured `max_batch_size` is 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Maximum number of query embeddings kept in the per-model query cache.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder stored in the fingerprint's `base_url` when the config has no
/// base URL or it fails to normalize.
const FALLBACK_BACKEND: &str = "none";
/// Total number of HTTP attempts per embedding request (initial try included).
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay before each retry, indexed by the zero-based attempt that just failed.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Process-wide mutex held while `SemanticIndexLock::acquire` waits for the
/// on-disk lock file.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
62
/// Handle for the per-project semantic cache lock file (`cache.lock`).
///
/// Wraps the guard returned by `fs_lock::try_acquire`; presumably the lock is
/// released when this value is dropped — confirm against `fs_lock::LockGuard`.
pub struct SemanticIndexLock {
    // Never read; kept only so the underlying guard lives as long as `self`.
    _guard: fs_lock::LockGuard,
}
66
67impl SemanticIndexLock {
68    pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
69        let dir = storage_dir.join("semantic").join(project_key);
70        fs::create_dir_all(&dir)?;
71        let path = dir.join("cache.lock");
72        let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
73            .lock()
74            .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
75        fs_lock::try_acquire(&path, Duration::from_secs(2))
76            .map(|guard| Self { _guard: guard })
77            .map_err(|error| match error {
78                fs_lock::AcquireError::Timeout => {
79                    std::io::Error::other("timed out acquiring semantic cache lock")
80                }
81                fs_lock::AcquireError::Io(error) => error,
82            })
83    }
84}
85
/// Identity of the embedding setup an index was built with.
///
/// Serialized to JSON by `as_string` and compared as a string by
/// `matches_expected`, so the field order below is part of the encoded form.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    /// Backend identifier, taken from `config.backend.as_str()`.
    pub backend: String,
    /// Embedding model name from the config.
    pub model: String,
    /// Normalized base URL, or the `FALLBACK_BACKEND` placeholder when none is
    /// configured / it fails to normalize. Defaults to empty when absent from
    /// serialized data.
    #[serde(default)]
    pub base_url: String,
    /// Embedding vector length.
    pub dimension: usize,
    /// Chunking scheme version; filled from `default_chunking_version` when
    /// absent from serialized data.
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}
96
/// Chunking version stamped on new fingerprints and assumed by serde when a
/// stored fingerprint lacks the field.
fn default_chunking_version() -> u32 {
    const CURRENT_CHUNKING_VERSION: u32 = 2;
    CURRENT_CHUNKING_VERSION
}
100
101impl SemanticIndexFingerprint {
102    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
103        // Use normalized URL for fingerprinting so cosmetic differences
104        // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds.
105        let base_url = config
106            .base_url
107            .as_ref()
108            .and_then(|u| normalize_base_url(u).ok())
109            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
110        Self {
111            backend: config.backend.as_str().to_string(),
112            model: config.model.clone(),
113            base_url,
114            dimension,
115            chunking_version: default_chunking_version(),
116        }
117    }
118
119    pub fn as_string(&self) -> String {
120        serde_json::to_string(self).unwrap_or_else(|_| String::new())
121    }
122
123    fn matches_expected(&self, expected: &str) -> bool {
124        let encoded = self.as_string();
125        !encoded.is_empty() && encoded == expected
126    }
127}
128
/// Backend-specific state needed to produce embeddings.
enum SemanticEmbeddingEngine {
    /// Local fastembed model.
    Fastembed(TextEmbedding),
    /// Remote server reached through the OpenAI-style embeddings endpoint.
    OpenAiCompatible {
        client: Client,
        model: String,
        /// Normalized base URL (scheme-checked, trailing slash stripped).
        base_url: String,
        /// Bearer token sent in the `Authorization` header when present.
        api_key: Option<String>,
    },
    /// Remote Ollama server.
    Ollama {
        client: Client,
        model: String,
        /// Normalized base URL (scheme-checked, trailing slash stripped).
        base_url: String,
    },
}
143
/// Embedding model bound to one configured backend, with a small cache of
/// query embeddings.
pub struct SemanticEmbeddingModel {
    backend: SemanticBackend,
    model: String,
    // Base URL exactly as given in the config (not normalized).
    base_url: Option<String>,
    // Effective HTTP timeout: the configured value, or the default when it was 0.
    timeout_ms: u64,
    // Effective batch size: the configured value, or the default when it was 0.
    max_batch_size: usize,
    // Cached embedding length; `None` until a probe or remote embed call sets it.
    dimension: Option<usize>,
    engine: SemanticEmbeddingEngine,
    // Query text -> embedding.
    query_embedding_cache: HashMap<String, Vec<f32>>,
    // Cached query texts in insertion order; the front entry is evicted first.
    query_embedding_cache_order: VecDeque<String>,
    query_embedding_cache_hits: u64,
    query_embedding_cache_misses: u64,
}
157
/// Alias for [`SemanticEmbeddingModel`].
// NOTE(review): presumably kept so existing callers of the shorter name keep compiling — confirm.
pub type EmbeddingModel = SemanticEmbeddingModel;
159
160fn validate_embedding_batch(
161    vectors: &[Vec<f32>],
162    expected_count: usize,
163    context: &str,
164) -> Result<(), String> {
165    if expected_count > 0 && vectors.is_empty() {
166        return Err(format!(
167            "{context} returned no vectors for {expected_count} inputs"
168        ));
169    }
170
171    if vectors.len() != expected_count {
172        return Err(format!(
173            "{context} returned {} vectors for {} inputs",
174            vectors.len(),
175            expected_count
176        ));
177    }
178
179    let Some(first_vector) = vectors.first() else {
180        return Ok(());
181    };
182    let expected_dimension = first_vector.len();
183    validate_embedding_dimension(expected_dimension)
184        .map_err(|error| format!("{context} returned {error}"))?;
185    for (index, vector) in vectors.iter().enumerate() {
186        if vector.len() != expected_dimension {
187            return Err(format!(
188                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
189                vector.len()
190            ));
191        }
192    }
193
194    Ok(())
195}
196
197fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
198    if dimension == 0 || dimension > MAX_DIMENSION {
199        return Err(format!(
200            "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
201        ));
202    }
203
204    Ok(())
205}
206
207/// Normalize a base URL: validate scheme and strip trailing slash.
208/// Does NOT perform SSRF/private-IP validation — call
209/// `validate_base_url_no_ssrf` separately when processing user-supplied config.
210fn normalize_base_url(raw: &str) -> Result<String, String> {
211    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
212    let scheme = parsed.scheme();
213    if scheme != "http" && scheme != "https" {
214        return Err(format!(
215            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
216            scheme
217        ));
218    }
219    Ok(parsed.to_string().trim_end_matches('/').to_string())
220}
221
222/// Validate that a base URL does not point to a private/loopback address.
223/// Call this on user-supplied config (at configure time) to prevent SSRF.
224/// Not called for programmatically constructed configs (e.g. tests).
225///
226/// **Loopback is allowed.** Self-hosted embedding backends (e.g. Ollama at
227/// `http://127.0.0.1:11434`) are a primary use case for `aft_search`. Loopback
228/// addresses by definition cannot be exploited as SSRF targets — they only
229/// reach services on the same machine. Allowing loopback unblocks Ollama at its
230/// default config without opening up SSRF to LAN/intranet services, which
231/// remain rejected.
232///
233/// **mDNS `.local` is rejected.** mDNS hostnames typically resolve to LAN
234/// devices (printers, homelab servers); rejecting them before DNS lookup keeps
235/// the SSRF guard meaningful for non-loopback private networks.
236pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
237    use std::net::{IpAddr, ToSocketAddrs};
238
239    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
240
241    let host = parsed.host_str().unwrap_or("");
242
243    // Loopback hostnames are explicitly allowed. RFC 6761 mandates that
244    // `localhost` and `*.localhost` resolve to loopback;
245    // `localhost.localdomain` is a historical alias used on some Linux
246    // distros. Self-hosted backends like Ollama use these by default.
247    let is_loopback_host =
248        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
249    if is_loopback_host {
250        return Ok(());
251    }
252
253    // mDNS hostnames are typically LAN devices, not loopback. Reject before
254    // DNS lookup so users get a clear error rather than a private-IP error.
255    if host.ends_with(".local") {
256        return Err(format!(
257            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
258        ));
259    }
260
261    // Resolve the hostname. Reject private/link-local/CGNAT IPs but NOT
262    // loopback (which is by definition same-machine and not an SSRF target).
263    let port = parsed.port_or_known_default().unwrap_or(443);
264    let addr_str = format!("{host}:{port}");
265    let addrs: Vec<IpAddr> = addr_str
266        .to_socket_addrs()
267        .map(|iter| iter.map(|sa| sa.ip()).collect())
268        .unwrap_or_default();
269    for ip in &addrs {
270        if is_private_non_loopback_ip(ip) {
271            return Err(format!(
272                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
273            ));
274        }
275    }
276
277    Ok(())
278}
279
/// Returns true for IPv4/IPv6 addresses in private/link-local/CGNAT/wildcard
/// ranges, EXCLUDING loopback (127.0.0.0/8 and ::1). Loopback is considered
/// safe for SSRF purposes — see [`validate_base_url_no_ssrf`] for rationale.
///
/// Rejected IPv4 ranges: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16,
/// 169.254.0.0/16 (link-local), 100.64.0.0/10 (CGNAT) and 0.0.0.0/8
/// (wildcard). Rejected IPv6 ranges: `::` (unspecified, the IPv6 counterpart
/// of the IPv4 wildcard), fe80::/10 (link-local) and fc00::/7 (unique-local).
/// IPv4-mapped IPv6 addresses (`::ffff:a.b.c.d`) are judged by their embedded
/// IPv4 address.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;
    match ip {
        IpAddr::V4(v4) => {
            let o = v4.octets();
            // Note: 127.0.0.0/8 (loopback) is intentionally NOT in this set.
            // 10.0.0.0/8
            o[0] == 10
            // 172.16.0.0/12
            || (o[0] == 172 && (16..=31).contains(&o[1]))
            // 192.168.0.0/16
            || (o[0] == 192 && o[1] == 168)
            // 169.254.0.0/16 link-local
            || (o[0] == 169 && o[1] == 254)
            // 100.64.0.0/10 CGNAT
            || (o[0] == 100 && (64..=127).contains(&o[1]))
            // 0.0.0.0/8 wildcard
            || o[0] == 0
        }
        IpAddr::V6(v6) => {
            // ::ffff:0:0/96 IPv4-mapped — classify by the embedded IPv4 address.
            if let Some(mapped) = v6.to_ipv4_mapped() {
                return is_private_non_loopback_ip(&IpAddr::V4(mapped));
            }

            // Note: ::1 (loopback) is intentionally NOT in this set.
            let prefix = v6.segments()[0];
            // :: unspecified (wildcard), mirroring the 0.0.0.0/8 rule above
            v6.is_unspecified()
            // fe80::/10 link-local
            || (prefix & 0xffc0) == 0xfe80
            // fc00::/7 unique-local
            || (prefix & 0xfe00) == 0xfc00
        }
    }
}
321
322fn build_openai_embeddings_endpoint(base_url: &str) -> String {
323    if base_url.ends_with("/v1") {
324        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
325    } else {
326        format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
327    }
328}
329
330fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
331    if base_url.ends_with("/api") {
332        format!("{base_url}/embed")
333    } else {
334        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
335    }
336}
337
/// Trims surrounding whitespace from an optional token and treats a missing,
/// empty, or whitespace-only value as `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let trimmed = value.as_deref().map(str::trim)?;
    if trimmed.is_empty() {
        return None;
    }
    Some(trimmed.to_owned())
}
348
349fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
350    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
351}
352
353fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
354    error.is_connect()
355}
356
357fn sleep_before_embedding_retry(attempt_index: usize) {
358    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
359        std::thread::sleep(Duration::from_millis(*delay_ms));
360    }
361}
362
363fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
364where
365    F: FnMut() -> reqwest::blocking::RequestBuilder,
366{
367    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
368        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
369
370        let response = match make_request().send() {
371            Ok(response) => response,
372            Err(error) => {
373                if !last_attempt && is_retryable_embedding_error(&error) {
374                    sleep_before_embedding_retry(attempt_index);
375                    continue;
376                }
377                return Err(format!("{backend_label} request failed: {error}"));
378            }
379        };
380
381        let status = response.status();
382        let raw = match response.text() {
383            Ok(raw) => raw,
384            Err(error) => {
385                if !last_attempt && is_retryable_embedding_error(&error) {
386                    sleep_before_embedding_retry(attempt_index);
387                    continue;
388                }
389                return Err(format!("{backend_label} response read failed: {error}"));
390            }
391        };
392
393        if status.is_success() {
394            return Ok(raw);
395        }
396
397        if !last_attempt && is_retryable_embedding_status(status) {
398            sleep_before_embedding_retry(attempt_index);
399            continue;
400        }
401
402        return Err(format!(
403            "{backend_label} request failed (HTTP {}): {}",
404            status, raw
405        ));
406    }
407
408    unreachable!("embedding request retries exhausted without returning")
409}
410
411impl SemanticEmbeddingModel {
412    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
413        let timeout_ms = if config.timeout_ms == 0 {
414            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
415        } else {
416            config.timeout_ms
417        };
418
419        let max_batch_size = if config.max_batch_size == 0 {
420            DEFAULT_MAX_BATCH_SIZE
421        } else {
422            config.max_batch_size
423        };
424
425        let api_key_env = normalize_api_key(config.api_key_env.clone());
426        let model = config.model.clone();
427
428        let client = Client::builder()
429            .timeout(Duration::from_millis(timeout_ms))
430            .redirect(reqwest::redirect::Policy::none())
431            .build()
432            .map_err(|error| format!("failed to configure embedding client: {error}"))?;
433
434        let engine = match config.backend {
435            SemanticBackend::Fastembed => {
436                SemanticEmbeddingEngine::Fastembed(initialize_text_embedding(&model)?)
437            }
438            SemanticBackend::OpenAiCompatible => {
439                let raw = config.base_url.as_ref().ok_or_else(|| {
440                    "base_url is required for openai_compatible backend".to_string()
441                })?;
442                let base_url = normalize_base_url(raw)?;
443
444                let api_key = match api_key_env {
445                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
446                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
447                    })?),
448                    None => None,
449                };
450
451                SemanticEmbeddingEngine::OpenAiCompatible {
452                    client,
453                    model,
454                    base_url,
455                    api_key,
456                }
457            }
458            SemanticBackend::Ollama => {
459                let raw = config
460                    .base_url
461                    .as_ref()
462                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
463                let base_url = normalize_base_url(raw)?;
464
465                SemanticEmbeddingEngine::Ollama {
466                    client,
467                    model,
468                    base_url,
469                }
470            }
471        };
472
473        Ok(Self {
474            backend: config.backend,
475            model: config.model.clone(),
476            base_url: config.base_url.clone(),
477            timeout_ms,
478            max_batch_size,
479            dimension: None,
480            engine,
481            query_embedding_cache: HashMap::new(),
482            query_embedding_cache_order: VecDeque::new(),
483            query_embedding_cache_hits: 0,
484            query_embedding_cache_misses: 0,
485        })
486    }
487
    /// Backend this model was configured with.
    pub fn backend(&self) -> SemanticBackend {
        self.backend
    }
491
    /// Embedding model name from the config.
    pub fn model(&self) -> &str {
        &self.model
    }
495
    /// Base URL as supplied in the config (not normalized), if any.
    pub fn base_url(&self) -> Option<&str> {
        self.base_url.as_deref()
    }
499
    /// Effective batch size: the configured value, or the default when the
    /// config specified 0.
    pub fn max_batch_size(&self) -> usize {
        self.max_batch_size
    }
503
    /// Effective HTTP timeout in milliseconds: the configured value, or the
    /// default when the config specified 0.
    pub fn timeout_ms(&self) -> u64 {
        self.timeout_ms
    }
507
508    pub fn fingerprint(
509        &mut self,
510        config: &SemanticBackendConfig,
511    ) -> Result<SemanticIndexFingerprint, String> {
512        let dimension = self.dimension()?;
513        Ok(SemanticIndexFingerprint::from_config(config, dimension))
514    }
515
516    pub fn dimension(&mut self) -> Result<usize, String> {
517        if let Some(dimension) = self.dimension {
518            return Ok(dimension);
519        }
520
521        let dimension = match &mut self.engine {
522            SemanticEmbeddingEngine::Fastembed(model) => {
523                let vectors = model
524                    .embed(vec!["semantic index fingerprint probe".to_string()], None)
525                    .map_err(|error| format_embedding_init_error(error.to_string()))?;
526                vectors
527                    .first()
528                    .map(|v| v.len())
529                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
530            }
531            SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
532                let vectors =
533                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
534                vectors
535                    .first()
536                    .map(|v| v.len())
537                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
538            }
539            SemanticEmbeddingEngine::Ollama { .. } => {
540                let vectors =
541                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
542                vectors
543                    .first()
544                    .map(|v| v.len())
545                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
546            }
547        };
548
549        self.dimension = Some(dimension);
550        Ok(dimension)
551    }
552
    /// Embeds `texts` with the configured backend; public entry point that
    /// forwards to `embed_texts` without caching.
    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        self.embed_texts(texts)
    }
556
557    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
558        if let Some(vector) = self.query_embedding_cache.get(query) {
559            self.query_embedding_cache_hits += 1;
560            return Ok(vector.clone());
561        }
562
563        self.query_embedding_cache_misses += 1;
564        let embeddings = self.embed_texts(vec![query.to_string()])?;
565        let vector = embeddings
566            .first()
567            .cloned()
568            .ok_or_else(|| "embedding model returned no query vector".to_string())?;
569
570        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
571            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
572                self.query_embedding_cache.remove(&oldest);
573            }
574        }
575        self.query_embedding_cache
576            .insert(query.to_string(), vector.clone());
577        self.query_embedding_cache_order
578            .push_back(query.to_string());
579
580        Ok(vector)
581    }
582
583    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
584        (
585            self.query_embedding_cache_hits,
586            self.query_embedding_cache_misses,
587            self.query_embedding_cache.len(),
588        )
589    }
590
591    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
592        match &mut self.engine {
593            SemanticEmbeddingEngine::Fastembed(model) => model
594                .embed(texts, None::<usize>)
595                .map_err(|error| format_embedding_init_error(error.to_string()))
596                .map_err(|error| format!("failed to embed batch: {error}")),
597            SemanticEmbeddingEngine::OpenAiCompatible {
598                client,
599                model,
600                base_url,
601                api_key,
602            } => {
603                let expected_text_count = texts.len();
604                let endpoint = build_openai_embeddings_endpoint(base_url);
605                let body = serde_json::json!({
606                    "input": texts,
607                    "model": model,
608                });
609
610                let raw = send_embedding_request(
611                    || {
612                        // `.json(&body)` sets Content-Type: application/json
613                        // automatically. Do NOT add `.header("Content-Type",
614                        // "application/json")` afterwards — RequestBuilder::header()
615                        // calls HeaderMap::append, which produces TWO Content-Type
616                        // headers on the wire. OpenAI's /v1/embeddings endpoint
617                        // treats duplicate Content-Type as malformed and rejects
618                        // the body with 400 "you must provide a model parameter"
619                        // even when `model` is set. Verified end-to-end against
620                        // api.openai.com. See issue #36.
621                        let mut request = client.post(&endpoint).json(&body);
622
623                        if let Some(api_key) = api_key {
624                            request = request.header("Authorization", format!("Bearer {api_key}"));
625                        }
626
627                        request
628                    },
629                    "openai compatible",
630                )?;
631
632                #[derive(Deserialize)]
633                struct OpenAiResponse {
634                    data: Vec<OpenAiEmbeddingResult>,
635                }
636
637                #[derive(Deserialize)]
638                struct OpenAiEmbeddingResult {
639                    embedding: Vec<f32>,
640                    index: Option<u32>,
641                }
642
643                let parsed: OpenAiResponse = serde_json::from_str(&raw)
644                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
645                if parsed.data.len() != expected_text_count {
646                    return Err(format!(
647                        "openai compatible response returned {} embeddings for {} inputs",
648                        parsed.data.len(),
649                        expected_text_count
650                    ));
651                }
652
653                let mut vectors = vec![Vec::new(); parsed.data.len()];
654                for (i, item) in parsed.data.into_iter().enumerate() {
655                    let index = item.index.unwrap_or(i as u32) as usize;
656                    if index >= vectors.len() {
657                        return Err(
658                            "openai compatible response contains invalid vector index".to_string()
659                        );
660                    }
661                    vectors[index] = item.embedding;
662                }
663
664                for vector in &vectors {
665                    if vector.is_empty() {
666                        return Err(
667                            "openai compatible response contained missing vectors".to_string()
668                        );
669                    }
670                }
671
672                self.dimension = vectors.first().map(Vec::len);
673                Ok(vectors)
674            }
675            SemanticEmbeddingEngine::Ollama {
676                client,
677                model,
678                base_url,
679            } => {
680                let expected_text_count = texts.len();
681                let endpoint = build_ollama_embeddings_endpoint(base_url);
682
683                #[derive(Serialize)]
684                struct OllamaPayload<'a> {
685                    model: &'a str,
686                    input: Vec<String>,
687                }
688
689                let payload = OllamaPayload {
690                    model,
691                    input: texts,
692                };
693
694                let raw = send_embedding_request(
695                    || {
696                        // `.json(&payload)` sets Content-Type automatically.
697                        // Same duplicate-header trap as the OpenAI branch above
698                        // — most Ollama servers tolerate it, but the
699                        // single-Content-Type form is the correct one.
700                        client.post(&endpoint).json(&payload)
701                    },
702                    "ollama",
703                )?;
704
705                #[derive(Deserialize)]
706                struct OllamaResponse {
707                    embeddings: Vec<Vec<f32>>,
708                }
709
710                let parsed: OllamaResponse = serde_json::from_str(&raw)
711                    .map_err(|error| format!("invalid ollama response: {error}"))?;
712                if parsed.embeddings.is_empty() {
713                    return Err("ollama response returned no embeddings".to_string());
714                }
715                if parsed.embeddings.len() != expected_text_count {
716                    return Err(format!(
717                        "ollama response returned {} embeddings for {} inputs",
718                        parsed.embeddings.len(),
719                        expected_text_count
720                    ));
721                }
722
723                let vectors = parsed.embeddings;
724                for vector in &vectors {
725                    if vector.is_empty() {
726                        return Err("ollama response contained empty embeddings".to_string());
727                    }
728                }
729
730                self.dimension = vectors.first().map(Vec::len);
731                Ok(vectors)
732            }
733        }
734    }
735}
736
737/// Pre-validate ONNX Runtime by attempting a raw dlopen before ort touches it.
738/// This catches broken/incompatible .so files without risking a panic in the ort crate.
739/// Also checks the runtime version via OrtGetApiBase if available.
740pub fn pre_validate_onnx_runtime() -> Result<(), String> {
741    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();
742
743    #[cfg(any(target_os = "linux", target_os = "macos"))]
744    {
745        #[cfg(target_os = "linux")]
746        let default_name = "libonnxruntime.so";
747        #[cfg(target_os = "macos")]
748        let default_name = "libonnxruntime.dylib";
749
750        let lib_name = dylib_path.as_deref().unwrap_or(default_name);
751
752        unsafe {
753            let c_name = std::ffi::CString::new(lib_name)
754                .map_err(|e| format!("invalid library path: {}", e))?;
755            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
756            if handle.is_null() {
757                let err = libc::dlerror();
758                let msg = if err.is_null() {
759                    "unknown dlopen error".to_string()
760                } else {
761                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
762                };
763                return Err(format!(
764                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
765                     Run `npx @cortexkit/aft doctor` to diagnose.",
766                    lib_name, msg
767                ));
768            }
769
770            // Try to detect the runtime version from the file path or soname.
771            // libonnxruntime.so.1.19.0, libonnxruntime.1.24.4.dylib, etc.
772            let detected_version = detect_ort_version_from_path(lib_name);
773
774            libc::dlclose(handle);
775
776            // Check version compatibility — we need 1.24.x
777            if let Some(ref version) = detected_version {
778                let parts: Vec<&str> = version.split('.').collect();
779                if let (Some(major), Some(minor)) = (
780                    parts.first().and_then(|s| s.parse::<u32>().ok()),
781                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
782                ) {
783                    if major != 1 || minor < 20 {
784                        return Err(format_ort_version_mismatch(version, lib_name));
785                    }
786                }
787            }
788        }
789    }
790
791    #[cfg(target_os = "windows")]
792    {
793        // Validate ONNX Runtime availability on Windows by loading the DLL
794        // via LoadLibraryExW before the ort crate attempts its own LoadLibrary.
795        // This way we can produce a friendly error (with installation hints)
796        // instead of a raw LoadLibrary failure from deep inside fastembed.
797        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");
798
799        // Use kernel32 LoadLibraryExW for the validation — built-in, no
800        // crate dependency required. GetModuleFileNameW resolves the loaded
801        // DLL path for version probing via the version.dll API.
802        #[link(name = "kernel32")]
803        extern "system" {
804            fn LoadLibraryExW(
805                lpLibFileName: *const u16,
806                hFile: *mut std::ffi::c_void,
807                dwFlags: u32,
808            ) -> *mut std::ffi::c_void;
809            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
810            fn GetModuleFileNameW(
811                hModule: *mut std::ffi::c_void,
812                lpFilename: *mut u16,
813                nSize: u32,
814            ) -> u32;
815        }
816
817        #[link(name = "version")]
818        extern "system" {
819            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
820            fn GetFileVersionInfoW(
821                lptstrFilename: *const u16,
822                dwHandle: u32,
823                dwLen: u32,
824                lpData: *mut std::ffi::c_void,
825            ) -> i32;
826            fn VerQueryValueW(
827                pBlock: *mut std::ffi::c_void,
828                lpSubBlock: *const u16,
829                lplpBuffer: *mut *mut std::ffi::c_void,
830                puLen: *mut u32,
831            ) -> i32;
832        }
833
834        #[repr(C)]
835        struct VS_FIXEDFILEINFO {
836            dw_signature: u32,
837            dw_struc_version: u32,
838            dw_file_version_ms: u32, // HIWORD major, LOWORD minor
839            dw_file_version_ls: u32, // HIWORD build, LOWORD revision
840            dw_product_version_ms: u32,
841            dw_product_version_ls: u32,
842            dw_file_flags_mask: u32,
843            dw_file_flags: u32,
844            dw_file_os: u32,
845            dw_file_type: u32,
846            dw_file_subtype: u32,
847            dw_file_date_ms: u32,
848            dw_file_date_ls: u32,
849        }
850
851        unsafe {
852            use std::os::windows::ffi::OsStrExt;
853            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
854                .encode_wide()
855                .chain(std::iter::once(0))
856                .collect();
857
858            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
859            if handle.is_null() {
860                let err = std::io::Error::last_os_error();
861                return Err(format!(
862                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
863                     Run `npx @cortexkit/aft doctor` to diagnose.",
864                    lib_name, err
865                ));
866            }
867
868            // Probe the file version from PE resources so we can reject
869            // outdated DLLs (e.g. v1.9.x) before the ort crate panics.
870            let mut detected_major: u32 = 0;
871            let mut detected_minor: u32 = 0;
872            // Use MAX_UNICODEPATH (32767) so deeply nested ORT paths (e.g.
873            // long NuGet package paths under %USERPROFILE%) never truncate.
874            // GetModuleFileNameW truncates silently when the buffer is too
875            // small, which causes version probing to fail and the version
876            // check to be bypassed — better to allocate generously.
877            let mut path_buf = [0u16; 32767];
878            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
879            if path_len > 0 {
880                let mut dummy_handle: u32 = 0;
881                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
882                if info_size > 0 {
883                    let mut info = vec![0u8; info_size as usize];
884                    if GetFileVersionInfoW(
885                        path_buf.as_ptr(),
886                        0,
887                        info_size,
888                        info.as_mut_ptr() as *mut std::ffi::c_void,
889                    ) != 0
890                    {
891                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
892                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
893                        let mut vs_len: u32 = 0;
894                        if VerQueryValueW(
895                            info.as_mut_ptr() as *mut std::ffi::c_void,
896                            sub_block.as_ptr(),
897                            &mut vs_info,
898                            &mut vs_len,
899                        ) != 0
900                            && !vs_info.is_null()
901                        {
902                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
903                            detected_major = (*fixed).dw_file_version_ms >> 16;
904                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
905                        }
906                    }
907                }
908            }
909
910            FreeLibrary(handle);
911
912            // Version compatibility check (mirrors the Linux/macOS path).
913            // If version could not be detected (detected_major == 0) we let
914            // the load succeed — the ort crate will diagnose further.
915            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
916                let ver = format!("{}.{}", detected_major, detected_minor);
917                return Err(format_ort_version_mismatch(&ver, lib_name));
918            }
919        }
920    }
921
922    Ok(())
923}
924
925/// Try to extract the ORT version from the library filename or resolved symlink.
926/// Examples: "libonnxruntime.so.1.19.0" → "1.19.0", "libonnxruntime.1.24.4.dylib" → "1.24.4"
927#[cfg(any(test, target_os = "linux", target_os = "macos"))]
928fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
929    let path = std::path::Path::new(lib_path);
930
931    // Try the path as given, then follow symlinks
932    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
933        .into_iter()
934        .flatten()
935    {
936        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
937            if let Some(version) = extract_version_from_filename(name) {
938                return Some(version);
939            }
940        }
941    }
942
943    // Also check for versioned siblings in the same directory
944    if let Some(parent) = path.parent() {
945        if let Ok(entries) = std::fs::read_dir(parent) {
946            for entry in entries.flatten() {
947                if let Some(name) = entry.file_name().to_str() {
948                    if name.starts_with("libonnxruntime") {
949                        if let Some(version) = extract_version_from_filename(name) {
950                            return Some(version);
951                        }
952                    }
953                }
954            }
955        }
956    }
957
958    None
959}
960
961/// Extract version from filenames like "libonnxruntime.so.1.19.0" or "libonnxruntime.1.24.4.dylib"
962#[cfg(any(test, target_os = "linux", target_os = "macos"))]
963fn extract_version_from_filename(name: &str) -> Option<String> {
964    // Match patterns: .so.X.Y.Z or .X.Y.Z.dylib or .X.Y.Z.so
965    let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
966    re.find(name).map(|m| m.as_str().to_string())
967}
968
/// Build the shell command (indented by three spaces for embedding in a numbered
/// list) that a user can run to remove an incompatible ONNX Runtime library.
///
/// * Libraries under `/usr/local/lib`, or the bare default names
///   `libonnxruntime.so` / `libonnxruntime.dylib`, get the platform's system-wide
///   cleanup command on Linux and macOS.
/// * Anything else (and every path on other platforms) gets a plain `rm` of the
///   exact path, single-quoted for the shell.
fn suggest_removal_command(lib_path: &str) -> String {
    // Component-wise prefix test: a plain string prefix would also match
    // look-alike directories such as `/usr/local/lib64`, for which the
    // `/usr/local/lib/libonnxruntime*` glob would not touch the offending file.
    let in_usr_local_lib = std::path::Path::new(lib_path).starts_with("/usr/local/lib");

    if in_usr_local_lib || lib_path == "libonnxruntime.so" || lib_path == "libonnxruntime.dylib" {
        #[cfg(target_os = "linux")]
        return "   sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        #[cfg(target_os = "macos")]
        return "   sudo rm /usr/local/lib/libonnxruntime*".to_string();
    }

    // Close the quote, emit an escaped quote, reopen: keeps the suggestion
    // copy-pasteable even when the path itself contains a single quote.
    format!("   rm '{}'", lib_path.replace('\'', r"'\''"))
}
981
982/// Build the user-facing error message for an incompatible ONNX Runtime
983/// install. Extracted as a pure helper so we can unit-test the wording
984/// stability — the auto-fix recommendation must always come first because
985/// it's the only safe option, and the system-rm step must remain present
986/// because some users prefer the system-wide cleanup path.
987pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
988    format!(
989        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
990         Solutions:\n\
991         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
992         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
993         configures the bridge to load it instead of the system library — no \
994         changes to '{}'.\n\
995         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
996         {}\n\
997         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
998         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
999        version,
1000        lib_name,
1001        lib_name,
1002        suggest_removal_command(lib_name),
1003    )
1004}
1005
1006pub fn initialize_text_embedding(model: &str) -> Result<TextEmbedding, String> {
1007    // Pre-validate before ort can panic on a bad library
1008    pre_validate_onnx_runtime()?;
1009
1010    let selected_model = match model {
1011        "all-MiniLM-L6-v2" | "all-minilm-l6-v2" => FastembedEmbeddingModel::AllMiniLML6V2,
1012        _ => {
1013            return Err(format!(
1014                "unsupported fastembed model '{}'. Supported: all-MiniLM-L6-v2",
1015                model
1016            ))
1017        }
1018    };
1019
1020    TextEmbedding::try_new(InitOptions::new(selected_model)).map_err(format_embedding_init_error)
1021}
1022
/// Heuristically decide whether `message` describes a missing or unloadable
/// ONNX Runtime library.
///
/// Returns `true` when the message (ignoring leading whitespace) starts with
/// the exact prefix "ONNX Runtime not found.", or when its ASCII-lowercased
/// form mentions the runtime by name *and* contains a dynamic-loading failure
/// phrase.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    const RUNTIME_NAMES: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    const LOAD_FAILURE_PHRASES: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    // Messages produced by our own pre-validation are recognized verbatim.
    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let contains_any = |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

    contains_any(&RUNTIME_NAMES) && contains_any(&LOAD_FAILURE_PHRASES)
}
1048
1049fn format_embedding_init_error(error: impl Display) -> String {
1050    let message = error.to_string();
1051
1052    if is_onnx_runtime_unavailable(&message) {
1053        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1054    }
1055
1056    format!("failed to initialize semantic embedding model: {message}")
1057}
1058
/// A chunk of code ready for embedding — derived from a Symbol with context enrichment.
///
/// `embed_text` is what gets sent to the embedding backend; the remaining
/// fields are metadata carried alongside the resulting vector.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// Absolute file path
    pub file: PathBuf,
    /// Symbol name
    pub name: String,
    /// Symbol kind (function, class, struct, etc.)
    pub kind: SymbolKind,
    /// First line of the symbol's range (0-based internally, inclusive)
    pub start_line: u32,
    /// Last line of the symbol's range (same convention as `start_line`)
    pub end_line: u32,
    /// Whether the symbol is exported
    pub exported: bool,
    /// The enriched text that gets embedded (scope + signature + body snippet)
    pub embed_text: String,
    /// Short code snippet for display in results
    pub snippet: String,
}
1078
/// A stored embedding entry — chunk metadata + vector
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    /// The chunk this vector was computed from.
    chunk: SemanticChunk,
    /// Embedding returned by the backend for `chunk.embed_text`.
    vector: Vec<f32>,
}
1085
/// The semantic index — stores embeddings for all symbols in a project
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// One entry per embedded chunk.
    entries: Vec<EmbeddingEntry>,
    /// Track which files are indexed and their mtime for staleness detection
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Track indexed file sizes alongside mtimes for staleness detection
    file_sizes: HashMap<PathBuf, u64>,
    /// blake3 content hash per indexed file; combined with the mtime and size
    /// into a `FileFreshness` when a cached file is checked for staleness.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Embedding dimension (384 for MiniLM-L6-v2)
    dimension: usize,
    /// NOTE(review): `SemanticIndexFingerprint` is defined outside this section;
    /// the constructors visible here always start with `None`.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root; constructors debug-assert that it is absolute.
    project_root: PathBuf,
    /// NOTE(review): presumably files whose (re)indexing was postponed — the
    /// incremental refresh drops paths from this set once they leave the walked
    /// file set or are successfully re-collected. Confirm against the code that
    /// inserts into it.
    deferred_files: HashSet<PathBuf>,
}
1101
/// Per-file metadata gathered while collecting chunks. When an index is built
/// the three fields are split into the `file_mtimes`, `file_sizes` and
/// `file_hashes` maps of `SemanticIndex`.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time recorded for the file.
    mtime: SystemTime,
    /// File size recorded for the file (NOTE(review): presumably bytes — confirm
    /// in `collect_file_metadata`).
    size: u64,
    /// blake3 hash recorded for the file's content.
    content_hash: blake3::Hash,
}
1108
/// Outcome of one incremental refresh pass over the semantic index.
///
/// `changed`, `added` and `deleted` are file counts; `total_processed` is the
/// number of current/deleted files that were considered.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    pub changed: usize,
    pub added: usize,
    pub deleted: usize,
    pub total_processed: usize,
}

impl RefreshSummary {
    /// Whether the refresh left every file untouched, i.e. nothing was changed,
    /// added or deleted. `total_processed` does not influence the answer.
    pub fn is_noop(&self) -> bool {
        matches!((self.changed, self.added, self.deleted), (0, 0, 0))
    }
}
1125
/// Bundle of results from refreshing a set of invalidated files.
///
/// NOTE(review): neither the producer nor the consumer of this struct is
/// visible in this part of the file; the field notes below are inferred from
/// names and types only and should be confirmed.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Presumably the freshly embedded entries to add to the index.
    pub added_entries: Vec<EmbeddingEntry>,
    /// Presumably the new freshness record (mtime/size/hash) for each refreshed file.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Presumably the paths whose refresh finished.
    pub completed_paths: Vec<PathBuf>,
    /// File-count summary of the refresh.
    pub summary: RefreshSummary,
}
1133
/// Search result from a semantic query.
///
/// The first seven fields have the same names and types as the corresponding
/// `SemanticChunk` fields.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    pub file: PathBuf,
    pub name: String,
    pub kind: SymbolKind,
    pub start_line: u32,
    pub end_line: u32,
    pub exported: bool,
    pub snippet: String,
    /// Relevance score of this hit (NOTE(review): metric and range are defined
    /// by the search code, which is outside this section).
    pub score: f32,
    /// NOTE(review): presumably a label naming which retrieval path produced
    /// the hit — confirm against the code that fills it in.
    pub source: &'static str,
}
1147
1148impl SemanticIndex {
1149    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1150        debug_assert!(project_root.is_absolute());
1151        Self {
1152            entries: Vec::new(),
1153            file_mtimes: HashMap::new(),
1154            file_sizes: HashMap::new(),
1155            file_hashes: HashMap::new(),
1156            dimension,
1157            fingerprint: None,
1158            project_root,
1159            deferred_files: HashSet::new(),
1160        }
1161    }
1162
    /// Number of embedded symbol entries (one per chunk that received a
    /// vector), not the number of files.
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
1167
    /// Number of files currently tracked by the semantic index.
    ///
    /// Counted from the mtime table, so files that were indexed successfully
    /// but produced no chunks are included.
    pub fn indexed_file_count(&self) -> usize {
        self.file_mtimes.len()
    }
1172
1173    /// Human-readable status label for the index.
1174    pub fn status_label(&self) -> &'static str {
1175        if self.entries.is_empty() {
1176            "empty"
1177        } else {
1178            "ready"
1179        }
1180    }
1181
1182    fn collect_chunks(
1183        project_root: &Path,
1184        files: &[PathBuf],
1185    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1186        let per_file: Vec<(
1187            PathBuf,
1188            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1189        )> = files
1190            .par_iter()
1191            .map_init(HashMap::new, |parsers, file| {
1192                let result = collect_file_metadata(file).and_then(|metadata| {
1193                    collect_file_chunks(project_root, file, parsers)
1194                        .map(|chunks| (metadata, chunks))
1195                });
1196                (file.clone(), result)
1197            })
1198            .collect();
1199
1200        let mut chunks: Vec<SemanticChunk> = Vec::new();
1201        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1202
1203        for (file, result) in per_file {
1204            match result {
1205                Ok((metadata, file_chunks)) => {
1206                    file_metadata.insert(file, metadata);
1207                    chunks.extend(file_chunks);
1208                }
1209                Err(error) => {
1210                    // "unsupported file extension" is expected for non-code files
1211                    // (json, xml, .gitignore, etc.) that get included in the
1212                    // project walk. Pre-fix this was swallowed by .unwrap_or_default();
1213                    // we now skip silently to keep the log clean. Only real read/parse
1214                    // errors are worth surfacing.
1215                    if error == "unsupported file extension" {
1216                        continue;
1217                    }
1218                    slog_warn!(
1219                        "failed to collect semantic chunks for {}: {}",
1220                        file.display(),
1221                        error
1222                    );
1223                }
1224            }
1225        }
1226
1227        (chunks, file_metadata)
1228    }
1229
1230    fn build_from_chunks<F, P>(
1231        project_root: &Path,
1232        chunks: Vec<SemanticChunk>,
1233        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1234        embed_fn: &mut F,
1235        max_batch_size: usize,
1236        mut progress: Option<&mut P>,
1237    ) -> Result<Self, String>
1238    where
1239        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1240        P: FnMut(usize, usize),
1241    {
1242        debug_assert!(project_root.is_absolute());
1243        let total_chunks = chunks.len();
1244
1245        if chunks.is_empty() {
1246            return Ok(Self {
1247                entries: Vec::new(),
1248                file_mtimes: file_metadata
1249                    .iter()
1250                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
1251                    .collect(),
1252                file_sizes: file_metadata
1253                    .iter()
1254                    .map(|(path, metadata)| (path.clone(), metadata.size))
1255                    .collect(),
1256                file_hashes: file_metadata
1257                    .into_iter()
1258                    .map(|(path, metadata)| (path, metadata.content_hash))
1259                    .collect(),
1260                dimension: DEFAULT_DIMENSION,
1261                fingerprint: None,
1262                project_root: project_root.to_path_buf(),
1263                deferred_files: HashSet::new(),
1264            });
1265        }
1266
1267        // Embed in batches
1268        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1269        let mut expected_dimension: Option<usize> = None;
1270        let batch_size = max_batch_size.max(1);
1271        for batch_start in (0..chunks.len()).step_by(batch_size) {
1272            let batch_end = (batch_start + batch_size).min(chunks.len());
1273            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1274                .iter()
1275                .map(|c| c.embed_text.clone())
1276                .collect();
1277
1278            let vectors = embed_fn(batch_texts)?;
1279            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1280
1281            // Track consistent dimension across all batches
1282            if let Some(dim) = vectors.first().map(|v| v.len()) {
1283                match expected_dimension {
1284                    None => expected_dimension = Some(dim),
1285                    Some(expected) if dim != expected => {
1286                        return Err(format!(
1287                            "embedding dimension changed across batches: expected {expected}, got {dim}"
1288                        ));
1289                    }
1290                    _ => {}
1291                }
1292            }
1293
1294            for (i, vector) in vectors.into_iter().enumerate() {
1295                let chunk_idx = batch_start + i;
1296                entries.push(EmbeddingEntry {
1297                    chunk: chunks[chunk_idx].clone(),
1298                    vector,
1299                });
1300            }
1301
1302            if let Some(callback) = progress.as_mut() {
1303                callback(entries.len(), total_chunks);
1304            }
1305        }
1306
1307        let dimension = entries
1308            .first()
1309            .map(|e| e.vector.len())
1310            .unwrap_or(DEFAULT_DIMENSION);
1311
1312        Ok(Self {
1313            entries,
1314            file_mtimes: file_metadata
1315                .iter()
1316                .map(|(path, metadata)| (path.clone(), metadata.mtime))
1317                .collect(),
1318            file_sizes: file_metadata
1319                .iter()
1320                .map(|(path, metadata)| (path.clone(), metadata.size))
1321                .collect(),
1322            file_hashes: file_metadata
1323                .into_iter()
1324                .map(|(path, metadata)| (path, metadata.content_hash))
1325                .collect(),
1326            dimension,
1327            fingerprint: None,
1328            project_root: project_root.to_path_buf(),
1329            deferred_files: HashSet::new(),
1330        })
1331    }
1332
1333    /// Build the semantic index from a set of files using the provided embedding function.
1334    /// `embed_fn` takes a batch of texts and returns a batch of embedding vectors.
1335    pub fn build<F>(
1336        project_root: &Path,
1337        files: &[PathBuf],
1338        embed_fn: &mut F,
1339        max_batch_size: usize,
1340    ) -> Result<Self, String>
1341    where
1342        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1343    {
1344        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1345        Self::build_from_chunks(
1346            project_root,
1347            chunks,
1348            file_mtimes,
1349            embed_fn,
1350            max_batch_size,
1351            Option::<&mut fn(usize, usize)>::None,
1352        )
1353    }
1354
1355    /// Build the semantic index and report embedding progress using entry counts.
1356    pub fn build_with_progress<F, P>(
1357        project_root: &Path,
1358        files: &[PathBuf],
1359        embed_fn: &mut F,
1360        max_batch_size: usize,
1361        progress: &mut P,
1362    ) -> Result<Self, String>
1363    where
1364        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1365        P: FnMut(usize, usize),
1366    {
1367        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1368        let total_chunks = chunks.len();
1369        progress(0, total_chunks);
1370        Self::build_from_chunks(
1371            project_root,
1372            chunks,
1373            file_mtimes,
1374            embed_fn,
1375            max_batch_size,
1376            Some(progress),
1377        )
1378    }
1379
1380    /// Incrementally refresh entries for changed/new files only, preserving cached
1381    /// embeddings for unchanged files. Used when loading the index from disk and
1382    /// finding that a small fraction of files have moved on, deleted, or appeared.
1383    ///
1384    /// Returns `RefreshSummary` describing what changed. On success, `self` is
1385    /// mutated in place and remains a valid index.
1386    ///
1387    /// `current_files` is the full set of files the project considers indexable
1388    /// (typically `walk_project_files(...)`). Files in the cache that are no
1389    /// longer in this set are treated as deleted.
1390    pub fn refresh_stale_files<F, P>(
1391        &mut self,
1392        project_root: &Path,
1393        current_files: &[PathBuf],
1394        embed_fn: &mut F,
1395        max_batch_size: usize,
1396        progress: &mut P,
1397    ) -> Result<RefreshSummary, String>
1398    where
1399        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1400        P: FnMut(usize, usize),
1401    {
1402        self.backfill_missing_file_sizes();
1403
1404        // 1. Bucket files into deleted / changed / added.
1405        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
1406        self.deferred_files
1407            .retain(|path| current_set.contains(path.as_path()));
1408        let total_processed = current_set.len() + self.file_mtimes.len()
1409            - self
1410                .file_mtimes
1411                .keys()
1412                .filter(|path| current_set.contains(path.as_path()))
1413                .count();
1414
1415        // Files in cache that disappeared from disk OR are no longer in the
1416        // walked set. Both cases need their entries dropped.
1417        let mut deleted: Vec<PathBuf> = Vec::new();
1418        let mut changed: Vec<PathBuf> = Vec::new();
1419        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
1420        for indexed_path in &indexed_paths {
1421            if !current_set.contains(indexed_path.as_path()) {
1422                deleted.push(indexed_path.clone());
1423                continue;
1424            }
1425            let cached = match (
1426                self.file_mtimes.get(indexed_path),
1427                self.file_sizes.get(indexed_path),
1428                self.file_hashes.get(indexed_path),
1429            ) {
1430                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
1431                    mtime: *mtime,
1432                    size: *size,
1433                    content_hash: *hash,
1434                }),
1435                _ => None,
1436            };
1437            match cached
1438                .map(|freshness| cache_freshness::verify_file_strict(indexed_path, &freshness))
1439            {
1440                Some(FreshnessVerdict::HotFresh) => {}
1441                Some(FreshnessVerdict::ContentFresh {
1442                    new_mtime,
1443                    new_size,
1444                }) => {
1445                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
1446                    self.file_sizes.insert(indexed_path.clone(), new_size);
1447                }
1448                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
1449                    changed.push(indexed_path.clone());
1450                }
1451            }
1452        }
1453
1454        // Files in walk that were never indexed.
1455        let mut added: Vec<PathBuf> = Vec::new();
1456        for path in current_files {
1457            if !self.file_mtimes.contains_key(path) {
1458                added.push(path.clone());
1459            }
1460        }
1461
1462        // Fast path: nothing to do.
1463        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
1464            progress(0, 0);
1465            return Ok(RefreshSummary {
1466                total_processed,
1467                ..RefreshSummary::default()
1468            });
1469        }
1470
1471        // 2. Drop entries for deleted files immediately. Changed files are only
1472        //    replaced after successful re-extraction + embedding so transient
1473        //    read/parse errors keep the stale-but-valid cache entry.
1474        if !deleted.is_empty() {
1475            self.remove_indexed_files(&deleted);
1476        }
1477
1478        // 3. Embed the changed + added set, if any.
1479        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
1480        to_embed.extend(changed.iter().cloned());
1481        to_embed.extend(added.iter().cloned());
1482
1483        if to_embed.is_empty() {
1484            // Only deletions happened.
1485            progress(0, 0);
1486            return Ok(RefreshSummary {
1487                changed: 0,
1488                added: 0,
1489                deleted: deleted.len(),
1490                total_processed,
1491            });
1492        }
1493
1494        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
1495        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
1496        let vanished = to_embed
1497            .iter()
1498            .filter(|path| {
1499                changed_set.contains(path.as_path())
1500                    && !fresh_metadata.contains_key(*path)
1501                    && !path.exists()
1502            })
1503            .cloned()
1504            .collect::<Vec<_>>();
1505        if !vanished.is_empty() {
1506            self.remove_indexed_files(&vanished);
1507            deleted.extend(vanished);
1508        }
1509
1510        if chunks.is_empty() {
1511            progress(0, 0);
1512            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1513            for file in &successful_files {
1514                self.deferred_files.remove(file);
1515            }
1516            if !successful_files.is_empty() {
1517                self.entries
1518                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
1519            }
1520            let changed_count = changed
1521                .iter()
1522                .filter(|path| successful_files.contains(*path))
1523                .count();
1524            let added_count = added
1525                .iter()
1526                .filter(|path| successful_files.contains(*path))
1527                .count();
1528            for (file, metadata) in fresh_metadata {
1529                self.file_mtimes.insert(file.clone(), metadata.mtime);
1530                self.file_sizes.insert(file.clone(), metadata.size);
1531                self.file_hashes.insert(file.clone(), metadata.content_hash);
1532            }
1533            return Ok(RefreshSummary {
1534                changed: changed_count,
1535                added: added_count,
1536                deleted: deleted.len(),
1537                total_processed,
1538            });
1539        }
1540
1541        // 4. Embed in batches and dimension-check against the existing index.
1542        let total_chunks = chunks.len();
1543        progress(0, total_chunks);
1544        let batch_size = max_batch_size.max(1);
1545        let existing_dimension = if self.entries.is_empty() {
1546            None
1547        } else {
1548            Some(self.dimension)
1549        };
1550        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1551        let mut observed_dimension: Option<usize> = existing_dimension;
1552
1553        for batch_start in (0..chunks.len()).step_by(batch_size) {
1554            let batch_end = (batch_start + batch_size).min(chunks.len());
1555            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1556                .iter()
1557                .map(|c| c.embed_text.clone())
1558                .collect();
1559
1560            let vectors = embed_fn(batch_texts)?;
1561            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1562
1563            if let Some(dim) = vectors.first().map(|v| v.len()) {
1564                match observed_dimension {
1565                    None => observed_dimension = Some(dim),
1566                    Some(expected) if dim != expected => {
1567                        // Refuse to mix dimensions in one index. Caller should
1568                        // fall back to a full rebuild.
1569                        return Err(format!(
1570                            "embedding dimension changed during incremental refresh: \
1571                             cached index uses {expected}, new vectors use {dim}"
1572                        ));
1573                    }
1574                    _ => {}
1575                }
1576            }
1577
1578            for (i, vector) in vectors.into_iter().enumerate() {
1579                let chunk_idx = batch_start + i;
1580                new_entries.push(EmbeddingEntry {
1581                    chunk: chunks[chunk_idx].clone(),
1582                    vector,
1583                });
1584            }
1585
1586            progress(new_entries.len(), total_chunks);
1587        }
1588
1589        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1590        for file in &successful_files {
1591            self.deferred_files.remove(file);
1592        }
1593        if !successful_files.is_empty() {
1594            self.entries
1595                .retain(|entry| !successful_files.contains(&entry.chunk.file));
1596        }
1597
1598        self.entries.extend(new_entries);
1599        for (file, metadata) in fresh_metadata {
1600            self.file_mtimes.insert(file.clone(), metadata.mtime);
1601            self.file_sizes.insert(file.clone(), metadata.size);
1602            self.file_hashes.insert(file, metadata.content_hash);
1603        }
1604        if let Some(dim) = observed_dimension {
1605            self.dimension = dim;
1606        }
1607
1608        Ok(RefreshSummary {
1609            changed: changed
1610                .iter()
1611                .filter(|path| successful_files.contains(*path))
1612                .count(),
1613            added: added
1614                .iter()
1615                .filter(|path| successful_files.contains(*path))
1616                .count(),
1617            deleted: deleted.len(),
1618            total_processed,
1619        })
1620    }
1621
    /// Refresh exactly the files invalidated by the live watcher, without
    /// treating the provided path list as the whole project. This is the
    /// watcher-side counterpart to `refresh_stale_files`: it drops any stale
    /// entries for the requested paths from this in-memory index, re-extracts
    /// whatever still exists on disk, embeds those chunks, and returns the
    /// delta needed for another in-memory index to apply the same update.
    ///
    /// `paths` is merged with the previously deferred files that still exist
    /// on disk, then sorted and de-duplicated. `embed_fn` is invoked once per
    /// batch of at most `max_batch_size` texts (a batch size of 0 is treated
    /// as 1). `max_files` caps how many files may be tracked: files that were
    /// not indexed before and do not fit under the cap are recorded in
    /// `deferred_files` and left out of this refresh. `progress` is called
    /// with `(embedded_chunks, total_chunks)`.
    ///
    /// # Errors
    ///
    /// Propagates errors from `embed_fn` and `validate_embedding_batch`, and
    /// fails when a batch's vector length differs from the expected
    /// dimension. On any of these errors the entries and metadata of the
    /// requested paths have already been removed from this index and are not
    /// restored.
    pub fn refresh_invalidated_files<F, P>(
        &mut self,
        project_root: &Path,
        paths: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        max_files: usize,
        progress: &mut P,
    ) -> Result<InvalidatedFilesRefresh, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        // Forget deferred files that vanished from disk, then retry the
        // remaining ones together with the explicitly requested paths.
        self.deferred_files.retain(|path| path.exists());
        let mut requested_paths = paths.to_vec();
        requested_paths.extend(self.deferred_files.iter().cloned());
        requested_paths.sort();
        requested_paths.dedup();
        let total_processed = requested_paths.len();

        if requested_paths.is_empty() {
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                summary: RefreshSummary {
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        // Snapshot which requested paths had metadata *before* the removal
        // below, so they can later be classified as changed/deleted rather
        // than added.
        let previously_indexed: HashSet<PathBuf> = requested_paths
            .iter()
            .filter(|path| self.file_mtimes.contains_key(*path))
            .cloned()
            .collect();

        // The watcher path has already invalidated these files in the request
        // thread's live index. Mirror that behavior here before inserting any
        // fresh chunks so parse/read failures do not resurrect stale entries.
        self.remove_indexed_files(&requested_paths);

        let existing_paths = requested_paths
            .iter()
            .filter(|path| path.exists())
            .cloned()
            .collect::<Vec<_>>();
        // Only paths that were indexed before and no longer exist count as
        // deletions; a missing path that was never indexed is not counted.
        let deleted = requested_paths
            .iter()
            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
            .count();

        if existing_paths.is_empty() {
            // Nothing is left on disk to re-index: make sure none of the
            // missing paths linger in the deferred set and report deletions.
            for path in &requested_paths {
                if !path.exists() {
                    self.deferred_files.remove(path);
                }
            }
            progress(0, 0);
            return Ok(InvalidatedFilesRefresh {
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    deleted,
                    total_processed,
                    ..RefreshSummary::default()
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);

        // Enforce the indexed-file cap. Files still tracked after the removal
        // above, plus previously indexed files that produced fresh metadata,
        // count against `max_files`; only the remainder is available to files
        // that were not indexed before.
        let retained_file_count = self.file_mtimes.len();
        let changed_successful_count = existing_paths
            .iter()
            .filter(|path| {
                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .count();
        let available_new_files =
            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
        let new_successful_files = existing_paths
            .iter()
            .filter(|path| {
                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
            })
            .cloned()
            .collect::<Vec<_>>();
        if new_successful_files.len() > available_new_files {
            // New files are admitted in `existing_paths` order until the cap
            // is reached; the rest are deferred to a later refresh.
            let allowed_new_files = new_successful_files
                .iter()
                .take(available_new_files)
                .cloned()
                .collect::<HashSet<_>>();
            let deferred_new_files = new_successful_files
                .into_iter()
                .filter(|path| !allowed_new_files.contains(path))
                .collect::<HashSet<_>>();

            // Previously indexed files are always kept; drop metadata and
            // chunks belonging to the deferred new files.
            fresh_metadata.retain(|file, _| {
                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
            });
            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));

            if !deferred_new_files.is_empty() {
                for path in &deferred_new_files {
                    self.deferred_files.insert(path.clone());
                }
                slog_warn!(
                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
                    deferred_new_files.len(),
                    max_files
                );
            }
        }

        // Every file that still has fresh metadata is being handled now, so
        // it no longer needs to be retried as a deferred file.
        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        for file in &successful_files {
            self.deferred_files.remove(file);
        }
        let changed = successful_files
            .iter()
            .filter(|path| previously_indexed.contains(path.as_path()))
            .count();
        let added = successful_files.len().saturating_sub(changed);
        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());

        if chunks.is_empty() {
            // No chunks to embed: still record the fresh metadata so the
            // processed files are tracked, and return without calling
            // `embed_fn`.
            progress(0, 0);
            for (file, metadata) in fresh_metadata {
                let freshness = FileFreshness {
                    mtime: metadata.mtime,
                    size: metadata.size,
                    content_hash: metadata.content_hash,
                };
                self.file_mtimes.insert(file.clone(), freshness.mtime);
                self.file_sizes.insert(file.clone(), freshness.size);
                self.file_hashes
                    .insert(file.clone(), freshness.content_hash);
                updated_metadata.push((file, freshness));
            }

            return Ok(InvalidatedFilesRefresh {
                updated_metadata,
                completed_paths: requested_paths,
                summary: RefreshSummary {
                    changed,
                    added,
                    deleted,
                    total_processed,
                },
                ..InvalidatedFilesRefresh::default()
            });
        }

        let total_chunks = chunks.len();
        progress(0, total_chunks);
        let batch_size = max_batch_size.max(1);
        // If the index still has entries, or any requested file was indexed
        // before, new vectors must match the current dimension. Otherwise the
        // first embedded batch decides the dimension.
        let mut observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());

        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|chunk| chunk.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            // Only the first vector of each batch is compared here.
            if let Some(dim) = vectors.first().map(|vector| vector.len()) {
                match observed_dimension {
                    None => observed_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        return Err(format!(
                            "embedding dimension changed during invalidated-file refresh: \
                             cached index uses {expected}, new vectors use {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            // Vectors are paired with chunks by position within the batch.
            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                new_entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            progress(new_entries.len(), total_chunks);
        }

        // Keep a copy of the new entries for the returned delta; the
        // originals are moved into this index.
        let added_entries = new_entries.clone();
        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            let freshness = FileFreshness {
                mtime: metadata.mtime,
                size: metadata.size,
                content_hash: metadata.content_hash,
            };
            self.file_mtimes.insert(file.clone(), freshness.mtime);
            self.file_sizes.insert(file.clone(), freshness.size);
            self.file_hashes
                .insert(file.clone(), freshness.content_hash);
            updated_metadata.push((file, freshness));
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(InvalidatedFilesRefresh {
            added_entries,
            updated_metadata,
            completed_paths: requested_paths,
            summary: RefreshSummary {
                changed,
                added,
                deleted,
                total_processed,
            },
        })
    }
1859
1860    pub fn apply_refresh_update(
1861        &mut self,
1862        added_entries: Vec<EmbeddingEntry>,
1863        updated_metadata: Vec<(PathBuf, FileFreshness)>,
1864        completed_paths: &[PathBuf],
1865    ) {
1866        self.remove_indexed_files(completed_paths);
1867
1868        let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
1869        self.entries.extend(added_entries);
1870        for (file, freshness) in updated_metadata {
1871            self.file_mtimes.insert(file.clone(), freshness.mtime);
1872            self.file_sizes.insert(file.clone(), freshness.size);
1873            self.file_hashes.insert(file, freshness.content_hash);
1874        }
1875        if let Some(dim) = observed_dimension {
1876            self.dimension = dim;
1877        }
1878    }
1879
1880    fn remove_indexed_files(&mut self, files: &[PathBuf]) {
1881        let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1882        self.entries
1883            .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
1884        for path in files {
1885            self.file_mtimes.remove(path);
1886            self.file_sizes.remove(path);
1887            self.file_hashes.remove(path);
1888        }
1889    }
1890
1891    /// Search the index with a query embedding, returning top-K results sorted by relevance
1892    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1893        if self.entries.is_empty() || query_vector.len() != self.dimension {
1894            return Vec::new();
1895        }
1896
1897        let mut scored: Vec<(f32, usize)> = self
1898            .entries
1899            .iter()
1900            .enumerate()
1901            .map(|(i, entry)| {
1902                let mut score = cosine_similarity(query_vector, &entry.vector);
1903                if entry.chunk.exported {
1904                    score *= 1.1;
1905                }
1906                (score, i)
1907            })
1908            .collect();
1909
1910        // Sort descending by score
1911        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1912
1913        scored
1914            .into_iter()
1915            .take(top_k)
1916            // Keep the sort → take → map ordering explicit: removing the old
1917            // `> 0.0` floor cannot evict positive hits because top_k has already
1918            // been selected, but it can surface zero-score noise in the tail.
1919            .map(|(score, idx)| {
1920                let entry = &self.entries[idx];
1921                SemanticResult {
1922                    file: entry.chunk.file.clone(),
1923                    name: entry.chunk.name.clone(),
1924                    kind: entry.chunk.kind.clone(),
1925                    start_line: entry.chunk.start_line,
1926                    end_line: entry.chunk.end_line,
1927                    exported: entry.chunk.exported,
1928                    snippet: entry.chunk.snippet.clone(),
1929                    score,
1930                    source: "semantic",
1931                }
1932            })
1933            .collect()
1934    }
1935
1936    /// Number of indexed entries
1937    pub fn len(&self) -> usize {
1938        self.entries.len()
1939    }
1940
1941    /// Check if a file needs re-indexing based on mtime/size
1942    pub fn is_file_stale(&self, file: &Path) -> bool {
1943        let Some(stored_mtime) = self.file_mtimes.get(file) else {
1944            return true;
1945        };
1946        let Some(stored_size) = self.file_sizes.get(file) else {
1947            return true;
1948        };
1949        let Some(stored_hash) = self.file_hashes.get(file) else {
1950            return true;
1951        };
1952        let cached = FileFreshness {
1953            mtime: *stored_mtime,
1954            size: *stored_size,
1955            content_hash: *stored_hash,
1956        };
1957        match cache_freshness::verify_file_strict(file, &cached) {
1958            FreshnessVerdict::HotFresh => false,
1959            FreshnessVerdict::ContentFresh { .. } => false,
1960            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
1961        }
1962    }
1963
1964    fn backfill_missing_file_sizes(&mut self) {
1965        for path in self.file_mtimes.keys() {
1966            if self.file_sizes.contains_key(path) {
1967                continue;
1968            }
1969            if let Ok(metadata) = fs::metadata(path) {
1970                self.file_sizes.insert(path.clone(), metadata.len());
1971                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
1972                    self.file_hashes.insert(path.clone(), hash);
1973                }
1974            }
1975        }
1976    }
1977
1978    /// Remove entries for a specific file
1979    pub fn remove_file(&mut self, file: &Path) {
1980        self.invalidate_file(file);
1981    }
1982
1983    pub fn invalidate_file(&mut self, file: &Path) {
1984        let canonical_file = canonicalize_existing_or_deleted_path(file);
1985        self.entries
1986            .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
1987        self.file_mtimes.remove(file);
1988        self.file_sizes.remove(file);
1989        self.file_hashes.remove(file);
1990        if canonical_file.as_path() != file {
1991            self.file_mtimes.remove(&canonical_file);
1992            self.file_sizes.remove(&canonical_file);
1993            self.file_hashes.remove(&canonical_file);
1994        }
1995    }
1996
1997    /// Get the embedding dimension
1998    pub fn dimension(&self) -> usize {
1999        self.dimension
2000    }
2001
2002    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
2003        self.fingerprint.as_ref()
2004    }
2005
2006    pub fn backend_label(&self) -> Option<&str> {
2007        self.fingerprint.as_ref().map(|f| f.backend.as_str())
2008    }
2009
2010    pub fn model_label(&self) -> Option<&str> {
2011        self.fingerprint.as_ref().map(|f| f.model.as_str())
2012    }
2013
2014    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
2015        self.fingerprint = Some(fingerprint);
2016    }
2017
2018    /// Write the semantic index to disk using atomic temp+rename pattern
2019    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2020        // Don't persist empty indexes — they would be loaded on next startup
2021        // and prevent a fresh build that might find files.
2022        if self.entries.is_empty() {
2023            slog_info!("skipping semantic index persistence (0 entries)");
2024            return;
2025        }
2026        let dir = storage_dir.join("semantic").join(project_key);
2027        if let Err(e) = fs::create_dir_all(&dir) {
2028            slog_warn!("failed to create semantic cache dir: {}", e);
2029            return;
2030        }
2031        let data_path = dir.join("semantic.bin");
2032        let tmp_path = dir.join(format!(
2033            "semantic.bin.tmp.{}.{}",
2034            std::process::id(),
2035            SystemTime::now()
2036                .duration_since(SystemTime::UNIX_EPOCH)
2037                .unwrap_or(Duration::ZERO)
2038                .as_nanos()
2039        ));
2040        let bytes = self.to_bytes();
2041        let write_result = (|| -> std::io::Result<()> {
2042            use std::io::Write;
2043            let mut file = fs::File::create(&tmp_path)?;
2044            file.write_all(&bytes)?;
2045            file.sync_all()?;
2046            Ok(())
2047        })();
2048        if let Err(e) = write_result {
2049            slog_warn!("failed to write semantic index: {}", e);
2050            let _ = fs::remove_file(&tmp_path);
2051            return;
2052        }
2053        if let Err(e) = fs::rename(&tmp_path, &data_path) {
2054            slog_warn!("failed to rename semantic index: {}", e);
2055            let _ = fs::remove_file(&tmp_path);
2056            return;
2057        }
2058        slog_info!(
2059            "semantic index persisted: {} entries, {:.1} KB",
2060            self.entries.len(),
2061            bytes.len() as f64 / 1024.0
2062        );
2063    }
2064
2065    /// Read the semantic index from disk
2066    pub fn read_from_disk(
2067        storage_dir: &Path,
2068        project_key: &str,
2069        current_canonical_root: &Path,
2070        is_worktree_bridge: bool,
2071        expected_fingerprint: Option<&str>,
2072    ) -> Option<Self> {
2073        debug_assert!(current_canonical_root.is_absolute());
2074        let data_path = storage_dir
2075            .join("semantic")
2076            .join(project_key)
2077            .join("semantic.bin");
2078        let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
2079        if file_len < HEADER_BYTES_V1 {
2080            slog_warn!(
2081                "corrupt semantic index (too small: {} bytes), removing",
2082                file_len
2083            );
2084            if !is_worktree_bridge {
2085                let _ = fs::remove_file(&data_path);
2086            }
2087            return None;
2088        }
2089
2090        let bytes = fs::read(&data_path).ok()?;
2091        let version = bytes[0];
2092        if version != SEMANTIC_INDEX_VERSION_V6 {
2093            slog_info!(
2094                "cached semantic index version {} is older than {}, rebuilding",
2095                version,
2096                SEMANTIC_INDEX_VERSION_V6
2097            );
2098            if !is_worktree_bridge {
2099                let _ = fs::remove_file(&data_path);
2100            }
2101            return None;
2102        }
2103        match Self::from_bytes(&bytes, current_canonical_root) {
2104            Ok(index) => {
2105                if index.entries.is_empty() {
2106                    slog_info!("cached semantic index is empty, will rebuild");
2107                    if !is_worktree_bridge {
2108                        let _ = fs::remove_file(&data_path);
2109                    }
2110                    return None;
2111                }
2112                if let Some(expected) = expected_fingerprint {
2113                    let matches = index
2114                        .fingerprint()
2115                        .map(|fingerprint| fingerprint.matches_expected(expected))
2116                        .unwrap_or(false);
2117                    if !matches {
2118                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2119                        if !is_worktree_bridge {
2120                            let _ = fs::remove_file(&data_path);
2121                        }
2122                        return None;
2123                    }
2124                }
2125                slog_info!(
2126                    "loaded semantic index from disk: {} entries",
2127                    index.entries.len()
2128                );
2129                Some(index)
2130            }
2131            Err(e) => {
2132                slog_warn!("corrupt semantic index, rebuilding: {}", e);
2133                if !is_worktree_bridge {
2134                    let _ = fs::remove_file(&data_path);
2135                }
2136                None
2137            }
2138        }
2139    }
2140
2141    /// Serialize the index to bytes for disk persistence
2142    pub fn to_bytes(&self) -> Vec<u8> {
2143        let mut buf = Vec::new();
2144        let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
2145            let encoded = fingerprint.as_string();
2146            if encoded.is_empty() {
2147                None
2148            } else {
2149                Some(encoded.into_bytes())
2150            }
2151        });
2152        let file_mtimes: Vec<_> = self
2153            .file_mtimes
2154            .iter()
2155            .filter_map(|(path, mtime)| {
2156                cache_relative_path(&self.project_root, path)
2157                    .map(|relative| (relative, path, mtime))
2158            })
2159            .collect();
2160        let entries: Vec<_> = self
2161            .entries
2162            .iter()
2163            .filter_map(|entry| {
2164                cache_relative_path(&self.project_root, &entry.chunk.file)
2165                    .map(|relative| (relative, entry))
2166            })
2167            .collect();
2168
2169        // Header: version(1) + dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint
2170        //
2171        // V6 is the single write format. Layout extends V5:
2172        //   - fingerprint is always represented (absent ⇒ fingerprint_len=0,
2173        //     no bytes follow). Uniform format simplifies the reader.
2174        //   - paths are relative to project_root.
2175        //   - file metadata stored as secs(u64) + subsec_nanos(u32) + size(u64) + blake3(32).
2176        //     Preserves full APFS/ext4/NTFS precision and catches mtime ties.
2177        //
2178        // V1/V2 remain readable for backward compatibility (see from_bytes).
2179        // V3/V4 load as compatible formats but are rejected on disk so snippets
2180        // and file sizes are rebuilt once.
2181        let version = SEMANTIC_INDEX_VERSION_V6;
2182        buf.push(version);
2183        buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
2184        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
2185        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
2186        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
2187        buf.extend_from_slice(fp_bytes_ref);
2188
2189        // File mtime table: count(4) + entries
2190        // V3 layout per entry: path_len(4) + path + secs(8) + subsec_nanos(4)
2191        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
2192        for (relative, path, mtime) in &file_mtimes {
2193            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
2194            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
2195            buf.extend_from_slice(&path_bytes);
2196            let duration = mtime
2197                .duration_since(SystemTime::UNIX_EPOCH)
2198                .unwrap_or_default();
2199            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
2200            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
2201            let size = self.file_sizes.get(*path).copied().unwrap_or_default();
2202            buf.extend_from_slice(&size.to_le_bytes());
2203            let hash = self
2204                .file_hashes
2205                .get(*path)
2206                .copied()
2207                .unwrap_or_else(cache_freshness::zero_hash);
2208            buf.extend_from_slice(hash.as_bytes());
2209        }
2210
2211        // Entries: each is metadata + vector
2212        for (relative, entry) in &entries {
2213            let c = &entry.chunk;
2214
2215            // File path
2216            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
2217            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
2218            buf.extend_from_slice(&file_bytes);
2219
2220            // Name
2221            let name_bytes = c.name.as_bytes();
2222            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
2223            buf.extend_from_slice(name_bytes);
2224
2225            // Kind (1 byte)
2226            buf.push(symbol_kind_to_u8(&c.kind));
2227
2228            // Lines + exported
2229            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
2230            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
2231            buf.push(c.exported as u8);
2232
2233            // Snippet
2234            let snippet_bytes = c.snippet.as_bytes();
2235            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
2236            buf.extend_from_slice(snippet_bytes);
2237
2238            // Embed text
2239            let embed_bytes = c.embed_text.as_bytes();
2240            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
2241            buf.extend_from_slice(embed_bytes);
2242
2243            // Vector (f32 array)
2244            for &val in &entry.vector {
2245                buf.extend_from_slice(&val.to_le_bytes());
2246            }
2247        }
2248
2249        buf
2250    }
2251
2252    /// Deserialize the index from bytes
2253    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2254        debug_assert!(current_canonical_root.is_absolute());
2255        let mut pos = 0;
2256
2257        if data.len() < HEADER_BYTES_V1 {
2258            return Err("data too short".to_string());
2259        }
2260
2261        let version = data[pos];
2262        pos += 1;
2263        if version != SEMANTIC_INDEX_VERSION_V1
2264            && version != SEMANTIC_INDEX_VERSION_V2
2265            && version != SEMANTIC_INDEX_VERSION_V3
2266            && version != SEMANTIC_INDEX_VERSION_V4
2267            && version != SEMANTIC_INDEX_VERSION_V5
2268            && version != SEMANTIC_INDEX_VERSION_V6
2269        {
2270            return Err(format!("unsupported version: {}", version));
2271        }
2272        // V2 and newer share the same header layout (V3/V4/V5 only differ from
2273        // V2 in the per-mtime entry layout): version(1) + dimension(4) +
2274        // entry_count(4) + fingerprint_len(4) + fingerprint bytes.
2275        if (version == SEMANTIC_INDEX_VERSION_V2
2276            || version == SEMANTIC_INDEX_VERSION_V3
2277            || version == SEMANTIC_INDEX_VERSION_V4
2278            || version == SEMANTIC_INDEX_VERSION_V5
2279            || version == SEMANTIC_INDEX_VERSION_V6)
2280            && data.len() < HEADER_BYTES_V2
2281        {
2282            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2283        }
2284
2285        let dimension = read_u32(data, &mut pos)? as usize;
2286        let entry_count = read_u32(data, &mut pos)? as usize;
2287        validate_embedding_dimension(dimension)?;
2288        if entry_count > MAX_ENTRIES {
2289            return Err(format!("too many semantic index entries: {}", entry_count));
2290        }
2291
2292        // Fingerprint handling:
2293        //   - V1: no fingerprint field at all.
2294        //   - V2: fingerprint_len + fingerprint bytes; always present (writer
2295        //     only emitted V2 when fingerprint was Some).
2296        //   - V3+: fingerprint_len always present; fingerprint_len==0 ⇒ None.
2297        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2298            || version == SEMANTIC_INDEX_VERSION_V3
2299            || version == SEMANTIC_INDEX_VERSION_V4
2300            || version == SEMANTIC_INDEX_VERSION_V5
2301            || version == SEMANTIC_INDEX_VERSION_V6;
2302        let fingerprint = if has_fingerprint_field {
2303            let fingerprint_len = read_u32(data, &mut pos)? as usize;
2304            if pos + fingerprint_len > data.len() {
2305                return Err("unexpected end of data reading fingerprint".to_string());
2306            }
2307            if fingerprint_len == 0 {
2308                None
2309            } else {
2310                let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
2311                pos += fingerprint_len;
2312                Some(
2313                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2314                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2315                )
2316            }
2317        } else {
2318            None
2319        };
2320
2321        // File mtimes
2322        let mtime_count = read_u32(data, &mut pos)? as usize;
2323        if mtime_count > MAX_ENTRIES {
2324            return Err(format!("too many semantic file mtimes: {}", mtime_count));
2325        }
2326
2327        let vector_bytes = entry_count
2328            .checked_mul(dimension)
2329            .and_then(|count| count.checked_mul(F32_BYTES))
2330            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2331        if vector_bytes > data.len().saturating_sub(pos) {
2332            return Err("semantic index vectors exceed available data".to_string());
2333        }
2334
2335        let mut file_mtimes = HashMap::with_capacity(mtime_count);
2336        let mut file_sizes = HashMap::with_capacity(mtime_count);
2337        let mut file_hashes = HashMap::with_capacity(mtime_count);
2338        for _ in 0..mtime_count {
2339            let path = read_string(data, &mut pos)?;
2340            let secs = read_u64(data, &mut pos)?;
2341            // V3+ persists subsec_nanos alongside secs so staleness checks
2342            // survive restart round-trips. V1/V2 load with 0 nanos, which
2343            // causes one rebuild on upgrade (they never matched live APFS
2344            // mtimes anyway — the bug v0.15.2 fixes). After that rebuild,
2345            // the cache is persisted as V3 and stabilises.
2346            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2347                || version == SEMANTIC_INDEX_VERSION_V4
2348                || version == SEMANTIC_INDEX_VERSION_V5
2349                || version == SEMANTIC_INDEX_VERSION_V6
2350            {
2351                read_u32(data, &mut pos)?
2352            } else {
2353                0
2354            };
2355            let size =
2356                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2357                    read_u64(data, &mut pos)?
2358                } else {
2359                    0
2360                };
2361            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2362                if pos + 32 > data.len() {
2363                    return Err("unexpected end of data reading content hash".to_string());
2364                }
2365                let mut hash_bytes = [0u8; 32];
2366                hash_bytes.copy_from_slice(&data[pos..pos + 32]);
2367                pos += 32;
2368                blake3::Hash::from_bytes(hash_bytes)
2369            } else {
2370                cache_freshness::zero_hash()
2371            };
2372            // Hardening against corrupt / maliciously crafted cache files
2373            // (v0.15.2). `Duration::new(secs, nanos)` can panic when the
2374            // nanosecond carry overflows the second counter, and
2375            // `SystemTime + Duration` can panic on carry past the platform's
2376            // upper bound. Explicit validation keeps a corrupted semantic.bin
2377            // from taking down the whole aft process.
2378            if nanos >= 1_000_000_000 {
2379                return Err(format!(
2380                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
2381                    nanos
2382                ));
2383            }
2384            let duration = std::time::Duration::new(secs, nanos);
2385            let mtime = SystemTime::UNIX_EPOCH
2386                .checked_add(duration)
2387                .ok_or_else(|| {
2388                    format!(
2389                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2390                        secs, nanos
2391                    )
2392                })?;
2393            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2394                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2395                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2396            } else {
2397                PathBuf::from(path)
2398            };
2399            file_mtimes.insert(path.clone(), mtime);
2400            file_sizes.insert(path.clone(), size);
2401            file_hashes.insert(path, content_hash);
2402        }
2403
2404        // Entries
2405        let mut entries = Vec::with_capacity(entry_count);
2406        for _ in 0..entry_count {
2407            let raw_file = PathBuf::from(read_string(data, &mut pos)?);
2408            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2409                cached_path_under_root(current_canonical_root, &raw_file)
2410                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2411            } else {
2412                raw_file
2413            };
2414            let name = read_string(data, &mut pos)?;
2415
2416            if pos >= data.len() {
2417                return Err("unexpected end of data".to_string());
2418            }
2419            let kind = u8_to_symbol_kind(data[pos]);
2420            pos += 1;
2421
2422            let start_line = read_u32(data, &mut pos)?;
2423            let end_line = read_u32(data, &mut pos)?;
2424
2425            if pos >= data.len() {
2426                return Err("unexpected end of data".to_string());
2427            }
2428            let exported = data[pos] != 0;
2429            pos += 1;
2430
2431            let snippet = read_string(data, &mut pos)?;
2432            let embed_text = read_string(data, &mut pos)?;
2433
2434            // Vector
2435            let vec_bytes = dimension
2436                .checked_mul(F32_BYTES)
2437                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2438            if pos + vec_bytes > data.len() {
2439                return Err("unexpected end of data reading vector".to_string());
2440            }
2441            let mut vector = Vec::with_capacity(dimension);
2442            for _ in 0..dimension {
2443                let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
2444                vector.push(f32::from_le_bytes(bytes));
2445                pos += 4;
2446            }
2447
2448            entries.push(EmbeddingEntry {
2449                chunk: SemanticChunk {
2450                    file,
2451                    name,
2452                    kind,
2453                    start_line,
2454                    end_line,
2455                    exported,
2456                    embed_text,
2457                    snippet,
2458                },
2459                vector,
2460            });
2461        }
2462
2463        if entries.len() != entry_count {
2464            return Err(format!(
2465                "semantic cache entry count drift: header={} decoded={}",
2466                entry_count,
2467                entries.len()
2468            ));
2469        }
2470        for entry in &entries {
2471            if !file_mtimes.contains_key(&entry.chunk.file) {
2472                return Err(format!(
2473                    "semantic cache metadata missing for entry file {}",
2474                    entry.chunk.file.display()
2475                ));
2476            }
2477        }
2478
2479        Ok(Self {
2480            entries,
2481            file_mtimes,
2482            file_sizes,
2483            file_hashes,
2484            dimension,
2485            fingerprint,
2486            project_root: current_canonical_root.to_path_buf(),
2487            deferred_files: HashSet::new(),
2488        })
2489    }
2490}
2491
2492/// Build enriched embedding text from a symbol with cAST-style context
2493fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2494    let relative = file
2495        .strip_prefix(project_root)
2496        .unwrap_or(file)
2497        .to_string_lossy();
2498
2499    let kind_label = match &symbol.kind {
2500        SymbolKind::Function => "function",
2501        SymbolKind::Class => "class",
2502        SymbolKind::Method => "method",
2503        SymbolKind::Struct => "struct",
2504        SymbolKind::Interface => "interface",
2505        SymbolKind::Enum => "enum",
2506        SymbolKind::TypeAlias => "type",
2507        SymbolKind::Variable => "variable",
2508        SymbolKind::Heading => "heading",
2509        SymbolKind::FileSummary => "file-summary",
2510    };
2511
2512    // Build: "file:relative/path kind:function name:validateAuth signature:fn validateAuth(token: &str) -> bool"
2513    let name = &symbol.name;
2514    let mut text = format!(
2515        "name:{name} file:{} kind:{} name:{name}",
2516        relative, kind_label
2517    );
2518
2519    if let Some(sig) = &symbol.signature {
2520        text.push_str(&format!(" signature:{}", sig));
2521    }
2522
2523    // Add body snippet (first ~300 chars of symbol body)
2524    let lines: Vec<&str> = source.lines().collect();
2525    let start = (symbol.range.start_line as usize).min(lines.len());
2526    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2527    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2528    if start < end {
2529        let body: String = lines[start..end]
2530            .iter()
2531            .take(15) // max 15 lines
2532            .copied()
2533            .collect::<Vec<&str>>()
2534            .join("\n");
2535        let snippet = if body.len() > 300 {
2536            format!("{}...", &body[..body.floor_char_boundary(300)])
2537        } else {
2538            body
2539        };
2540        text.push_str(&format!(" body:{}", snippet));
2541    }
2542
2543    text
2544}
2545
/// Return the first `max_chars` characters (Unicode scalar values, not bytes)
/// of `value` as an owned string; shorter inputs are returned whole.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    // The byte offset of the character at index `max_chars` is exactly where
    // the kept prefix ends; if there is no such character, keep everything.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
2549
2550fn first_leading_doc_comment(source: &str) -> String {
2551    let lines: Vec<&str> = source.lines().collect();
2552    let Some((start, first)) = lines
2553        .iter()
2554        .enumerate()
2555        .find(|(_, line)| !line.trim().is_empty())
2556    else {
2557        return String::new();
2558    };
2559
2560    let trimmed = first.trim_start();
2561    if trimmed.starts_with("/**") {
2562        let mut comment = Vec::new();
2563        for line in lines.iter().skip(start) {
2564            comment.push(*line);
2565            if line.contains("*/") {
2566                break;
2567            }
2568        }
2569        return truncate_chars(&comment.join("\n"), 200);
2570    }
2571
2572    if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2573        let comment = lines
2574            .iter()
2575            .skip(start)
2576            .take_while(|line| {
2577                let trimmed = line.trim_start();
2578                trimmed.starts_with("///") || trimmed.starts_with("//!")
2579            })
2580            .copied()
2581            .collect::<Vec<_>>()
2582            .join("\n");
2583        return truncate_chars(&comment, 200);
2584    }
2585
2586    String::new()
2587}
2588
2589pub fn build_file_summary_chunk(
2590    file: &Path,
2591    project_root: &Path,
2592    source: &str,
2593    top_exports: &[&str],
2594    top_export_signatures: &[Option<&str>],
2595) -> SemanticChunk {
2596    let relative = file.strip_prefix(project_root).unwrap_or(file);
2597    let rel_path = relative.to_string_lossy();
2598    let parent_dir = relative
2599        .parent()
2600        .map(|parent| parent.to_string_lossy().to_string())
2601        .unwrap_or_default();
2602    let name = file
2603        .file_stem()
2604        .map(|stem| stem.to_string_lossy().to_string())
2605        .unwrap_or_default();
2606    let doc = first_leading_doc_comment(source);
2607    let exports = top_exports
2608        .iter()
2609        .take(5)
2610        .copied()
2611        .collect::<Vec<_>>()
2612        .join(",");
2613    let snippet = if doc.is_empty() {
2614        top_export_signatures
2615            .first()
2616            .and_then(|signature| signature.as_deref())
2617            .map(|signature| truncate_chars(signature, 200))
2618            .unwrap_or_default()
2619    } else {
2620        doc.clone()
2621    };
2622
2623    SemanticChunk {
2624        file: file.to_path_buf(),
2625        name,
2626        kind: SymbolKind::FileSummary,
2627        start_line: 0,
2628        end_line: 0,
2629        exported: false,
2630        embed_text: format!(
2631            "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2632            file.file_stem()
2633                .map(|stem| stem.to_string_lossy().to_string())
2634                .unwrap_or_default()
2635        ),
2636        snippet,
2637    }
2638}
2639
2640fn parser_for(
2641    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2642    lang: crate::parser::LangId,
2643) -> Result<&mut Parser, String> {
2644    use std::collections::hash_map::Entry;
2645
2646    match parsers.entry(lang) {
2647        Entry::Occupied(entry) => Ok(entry.into_mut()),
2648        Entry::Vacant(entry) => {
2649            let grammar = grammar_for(lang);
2650            let mut parser = Parser::new();
2651            parser
2652                .set_language(&grammar)
2653                .map_err(|error| error.to_string())?;
2654            Ok(entry.insert(parser))
2655        }
2656    }
2657}
2658
/// Report whether `path` has one of the file extensions covered by the
/// semantic index.
///
/// The comparison is exact (case-sensitive). Paths without an extension, or
/// whose extension is not valid UTF-8, are not indexed.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "sol", "vue", "yaml", "yml",
    ];

    match path.extension().and_then(|extension| extension.to_str()) {
        Some(extension) => INDEXED_EXTENSIONS.contains(&extension),
        None => false,
    }
}
2688
2689fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2690    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2691    let mtime = metadata.modified().map_err(|error| error.to_string())?;
2692    let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2693        .map_err(|error| error.to_string())?
2694        .unwrap_or_else(cache_freshness::zero_hash);
2695    Ok(IndexedFileMetadata {
2696        mtime,
2697        size: metadata.len(),
2698        content_hash,
2699    })
2700}
2701
/// Canonicalize `path`, tolerating a final component that no longer exists.
///
/// Resolution order:
/// 1. the canonical form of `path` itself, when it resolves;
/// 2. otherwise the canonical form of its parent joined with its file name;
/// 3. otherwise (no parent, no file name, or an unresolvable parent) `path`
///    unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    fs::canonicalize(path).unwrap_or_else(|_| {
        match (path.parent(), path.file_name()) {
            (Some(parent), Some(file_name)) => match fs::canonicalize(parent) {
                Ok(canonical_parent) => canonical_parent.join(file_name),
                Err(_) => path.to_path_buf(),
            },
            _ => path.to_path_buf(),
        }
    })
}
2718
2719fn collect_file_chunks(
2720    project_root: &Path,
2721    file: &Path,
2722    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2723) -> Result<Vec<SemanticChunk>, String> {
2724    if !is_semantic_indexed_extension(file) {
2725        return Err("unsupported file extension".to_string());
2726    }
2727    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2728    let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2729    let tree = parser_for(parsers, lang)?
2730        .parse(&source, None)
2731        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2732    let symbols =
2733        extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2734
2735    Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2736}
2737
2738/// Build a display snippet from a symbol's source
2739fn build_snippet(symbol: &Symbol, source: &str) -> String {
2740    let lines: Vec<&str> = source.lines().collect();
2741    let start = (symbol.range.start_line as usize).min(lines.len());
2742    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2743    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2744    if start < end {
2745        let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2746        let mut snippet = snippet_lines.join("\n");
2747        if end - start > 5 {
2748            snippet.push_str("\n  ...");
2749        }
2750        if snippet.len() > 300 {
2751            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2752        }
2753        snippet
2754    } else {
2755        String::new()
2756    }
2757}
2758
2759/// Convert symbols to semantic chunks with enriched context
2760fn symbols_to_chunks(
2761    file: &Path,
2762    symbols: &[Symbol],
2763    source: &str,
2764    project_root: &Path,
2765) -> Vec<SemanticChunk> {
2766    let mut chunks = Vec::new();
2767    let top_exports_with_signatures = symbols
2768        .iter()
2769        .filter(|symbol| {
2770            symbol.exported
2771                && symbol.parent.is_none()
2772                && !matches!(symbol.kind, SymbolKind::Heading)
2773        })
2774        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2775        .collect::<Vec<_>>();
2776
2777    let has_only_headings = !symbols.is_empty()
2778        && symbols
2779            .iter()
2780            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2781    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2782        let top_exports = top_exports_with_signatures
2783            .iter()
2784            .map(|(name, _)| *name)
2785            .collect::<Vec<_>>();
2786        let top_export_signatures = top_exports_with_signatures
2787            .iter()
2788            .map(|(_, signature)| *signature)
2789            .collect::<Vec<_>>();
2790        chunks.push(build_file_summary_chunk(
2791            file,
2792            project_root,
2793            source,
2794            &top_exports,
2795            &top_export_signatures,
2796        ));
2797    }
2798
2799    for symbol in symbols {
2800        // Skip Markdown / HTML heading chunks: empirically they dominate result
2801        // lists even for code-shaped queries because heading prose embeds well.
2802        // Agents querying for code lose the actual matches under doc noise.
2803        // README/docs queries are still served by grep on the same files.
2804        if matches!(symbol.kind, SymbolKind::Heading) {
2805            continue;
2806        }
2807
2808        // Skip very small symbols (single-line variables, etc.)
2809        let line_count = symbol
2810            .range
2811            .end_line
2812            .saturating_sub(symbol.range.start_line)
2813            + 1;
2814        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2815            continue;
2816        }
2817
2818        let embed_text = build_embed_text(symbol, source, file, project_root);
2819        let snippet = build_snippet(symbol, source);
2820
2821        chunks.push(SemanticChunk {
2822            file: file.to_path_buf(),
2823            name: symbol.name.clone(),
2824            kind: symbol.kind.clone(),
2825            start_line: symbol.range.start_line,
2826            end_line: symbol.range.end_line,
2827            exported: symbol.exported,
2828            embed_text,
2829            snippet,
2830        });
2831
2832        // Note: Nested symbols are handled separately by the outline system
2833        // Each symbol is indexed individually
2834    }
2835
2836    chunks
2837}
2838
/// Cosine similarity of two vectors.
///
/// Returns `0.0` when the lengths differ or when the product of the two
/// norms is zero (for example, when either vector is all zeros).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Single pass accumulating the dot product and both squared norms, in
    // element order.
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, norm_a, norm_b), (&x, &y)| (dot + x * y, norm_a + x * x, norm_b + y * y),
    );

    let denom = norm_a.sqrt() * norm_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
2862
2863// Serialization helpers
2864fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2865    match kind {
2866        SymbolKind::Function => 0,
2867        SymbolKind::Class => 1,
2868        SymbolKind::Method => 2,
2869        SymbolKind::Struct => 3,
2870        SymbolKind::Interface => 4,
2871        SymbolKind::Enum => 5,
2872        SymbolKind::TypeAlias => 6,
2873        SymbolKind::Variable => 7,
2874        SymbolKind::Heading => 8,
2875        SymbolKind::FileSummary => 9,
2876    }
2877}
2878
2879fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2880    match v {
2881        0 => SymbolKind::Function,
2882        1 => SymbolKind::Class,
2883        2 => SymbolKind::Method,
2884        3 => SymbolKind::Struct,
2885        4 => SymbolKind::Interface,
2886        5 => SymbolKind::Enum,
2887        6 => SymbolKind::TypeAlias,
2888        7 => SymbolKind::Variable,
2889        8 => SymbolKind::Heading,
2890        9 => SymbolKind::FileSummary,
2891        _ => SymbolKind::Heading,
2892    }
2893}
2894
/// Read a little-endian `u32` from `data` at `*pos` and advance `*pos` by 4.
///
/// # Errors
/// Returns an error, leaving `*pos` untouched, when fewer than four bytes
/// remain at `*pos`. The bounds computation uses checked arithmetic, so an
/// out-of-range cursor is reported as an error rather than overflowing.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    let start = *pos;
    let end = start
        .checked_add(4)
        .filter(|&end| end <= data.len())
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;

    let mut bytes = [0u8; 4];
    bytes.copy_from_slice(&data[start..end]);
    *pos = end;
    Ok(u32::from_le_bytes(bytes))
}
2903
/// Read a little-endian `u64` from `data` at `*pos` and advance `*pos` by 8.
///
/// # Errors
/// Returns an error, leaving `*pos` untouched, when fewer than eight bytes
/// remain at `*pos`. The bounds computation uses checked arithmetic, so an
/// out-of-range cursor is reported as an error rather than overflowing.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    let start = *pos;
    let end = start
        .checked_add(8)
        .filter(|&end| end <= data.len())
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;

    let mut bytes = [0u8; 8];
    bytes.copy_from_slice(&data[start..end]);
    *pos = end;
    Ok(u64::from_le_bytes(bytes))
}
2912
2913fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
2914    let len = read_u32(data, pos)? as usize;
2915    if *pos + len > data.len() {
2916        return Err("unexpected end of data reading string".to_string());
2917    }
2918    let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
2919    *pos += len;
2920    Ok(s)
2921}
2922
2923#[cfg(test)]
2924mod tests {
2925    use super::*;
2926    use crate::config::{SemanticBackend, SemanticBackendConfig};
2927    use crate::parser::FileParser;
2928    use std::io::{Read, Write};
2929    use std::net::TcpListener;
2930    use std::thread;
2931
2932    fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
2933    where
2934        F: Fn(String, String, String) -> String + Send + 'static,
2935    {
2936        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
2937        let addr = listener.local_addr().expect("local addr");
2938        let handle = thread::spawn(move || {
2939            let (mut stream, _) = listener.accept().expect("accept request");
2940            let mut buf = Vec::new();
2941            let mut chunk = [0u8; 4096];
2942            let mut header_end = None;
2943            let mut content_length = 0usize;
2944            loop {
2945                let n = stream.read(&mut chunk).expect("read request");
2946                if n == 0 {
2947                    break;
2948                }
2949                buf.extend_from_slice(&chunk[..n]);
2950                if header_end.is_none() {
2951                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
2952                        header_end = Some(pos + 4);
2953                        let headers = String::from_utf8_lossy(&buf[..pos + 4]);
2954                        for line in headers.lines() {
2955                            if let Some(value) = line.strip_prefix("Content-Length:") {
2956                                content_length = value.trim().parse::<usize>().unwrap_or(0);
2957                            }
2958                        }
2959                    }
2960                }
2961                if let Some(end) = header_end {
2962                    if buf.len() >= end + content_length {
2963                        break;
2964                    }
2965                }
2966            }
2967
2968            let end = header_end.expect("header terminator");
2969            let request = String::from_utf8_lossy(&buf[..end]).to_string();
2970            let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
2971            let mut lines = request.lines();
2972            let request_line = lines.next().expect("request line").to_string();
2973            let path = request_line
2974                .split_whitespace()
2975                .nth(1)
2976                .expect("request path")
2977                .to_string();
2978            let response_body = handler(request_line, path, body);
2979            let response = format!(
2980                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
2981                response_body.len(),
2982                response_body
2983            );
2984            stream
2985                .write_all(response.as_bytes())
2986                .expect("write response");
2987        });
2988
2989        (format!("http://{}", addr), handle)
2990    }
2991
2992    fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
2993        Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
2994    }
2995
2996    fn write_rust_file(path: &Path, function_name: &str) {
2997        fs::write(
2998            path,
2999            format!("pub fn {function_name}() -> bool {{\n    true\n}}\n"),
3000        )
3001        .unwrap();
3002    }
3003
3004    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3005        let mut embed = test_vector_for_texts;
3006        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3007    }
3008
3009    fn test_project_root() -> PathBuf {
3010        std::env::current_dir().unwrap()
3011    }
3012
3013    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3014        index.file_mtimes.insert(file.to_path_buf(), mtime);
3015        index.file_sizes.insert(file.to_path_buf(), size);
3016        index
3017            .file_hashes
3018            .insert(file.to_path_buf(), cache_freshness::zero_hash());
3019    }
3020
    /// Populates an index whose only entry and file metadata are keyed by
    /// `<project>/../outside.rs`, round-trips it through `to_bytes` /
    /// `from_bytes` against the same project root, and expects the reloaded
    /// index to contain no entries and no mtime records.
    #[test]
    fn semantic_cache_serialization_skips_paths_outside_project_root() {
        let dir = tempfile::tempdir().expect("create temp dir");
        let project = fs::canonicalize(dir.path()).expect("canonical project");
        // `..` is joined lexically, so this names a sibling of the project directory.
        let outside = project.join("..").join("outside.rs");
        let mut index = SemanticIndex::new(project.clone(), 3);
        index
            .file_mtimes
            .insert(outside.clone(), SystemTime::UNIX_EPOCH);
        index.file_sizes.insert(outside.clone(), 1);
        index
            .file_hashes
            .insert(outside.clone(), cache_freshness::zero_hash());
        index.entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file: outside,
                name: "outside".to_string(),
                kind: SymbolKind::Function,
                start_line: 0,
                end_line: 0,
                exported: false,
                embed_text: "outside".to_string(),
                snippet: "outside".to_string(),
            },
            // Three components, matching the dimension passed to `SemanticIndex::new`.
            vector: vec![1.0, 0.0, 0.0],
        });

        let bytes = index.to_bytes();
        let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
        assert_eq!(loaded.entries.len(), 0);
        assert!(loaded.file_mtimes.is_empty());
    }
3053
3054    #[test]
3055    fn test_cosine_similarity_identical() {
3056        let a = vec![1.0, 0.0, 0.0];
3057        let b = vec![1.0, 0.0, 0.0];
3058        assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
3059    }
3060
3061    #[test]
3062    fn test_cosine_similarity_orthogonal() {
3063        let a = vec![1.0, 0.0, 0.0];
3064        let b = vec![0.0, 1.0, 0.0];
3065        assert!(cosine_similarity(&a, &b).abs() < 0.001);
3066    }
3067
3068    #[test]
3069    fn test_cosine_similarity_opposite() {
3070        let a = vec![1.0, 0.0, 0.0];
3071        let b = vec![-1.0, 0.0, 0.0];
3072        assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
3073    }
3074
    /// Round-trips a single-entry, dimension-4 index with a fingerprint
    /// through `to_bytes` / `from_bytes` and checks that the entry name, the
    /// vector, the dimension and the backend/model labels are restored.
    #[test]
    fn test_serialization_roundtrip() {
        let project_root = test_project_root();
        let file = project_root.join("src/main.rs");
        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
        index.entries.push(EmbeddingEntry {
            chunk: SemanticChunk {
                file: file.clone(),
                name: "handle_request".to_string(),
                kind: SymbolKind::Function,
                start_line: 10,
                end_line: 25,
                exported: true,
                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
                snippet: "fn handle_request() {\n  // ...\n}".to_string(),
            },
            vector: vec![0.1, 0.2, 0.3, 0.4],
        });
        // The index was created with DEFAULT_DIMENSION; override it to match
        // the four-component vector pushed above.
        index.dimension = 4;
        // Register mtime and size for the entry's file under the same path key.
        index
            .file_mtimes
            .insert(file.clone(), SystemTime::UNIX_EPOCH);
        index.file_sizes.insert(file, 0);
        index.set_fingerprint(SemanticIndexFingerprint {
            backend: "fastembed".to_string(),
            model: "all-MiniLM-L6-v2".to_string(),
            base_url: FALLBACK_BACKEND.to_string(),
            dimension: 4,
            chunking_version: default_chunking_version(),
        });

        let bytes = index.to_bytes();
        let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();

        assert_eq!(restored.entries.len(), 1);
        assert_eq!(restored.entries[0].chunk.name, "handle_request");
        assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
        assert_eq!(restored.dimension, 4);
        assert_eq!(restored.backend_label(), Some("fastembed"));
        assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
    }
3116
    /// Pins the byte tag of every `SymbolKind` variant (0 through 9, with
    /// `FileSummary` at 9) and checks that encoding and decoding agree in
    /// both directions.
    #[test]
    fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
        // (variant, expected serialized tag)
        let cases = [
            (SymbolKind::Function, 0),
            (SymbolKind::Class, 1),
            (SymbolKind::Method, 2),
            (SymbolKind::Struct, 3),
            (SymbolKind::Interface, 4),
            (SymbolKind::Enum, 5),
            (SymbolKind::TypeAlias, 6),
            (SymbolKind::Variable, 7),
            (SymbolKind::Heading, 8),
            (SymbolKind::FileSummary, 9),
        ];

        for (kind, encoded) in cases {
            assert_eq!(symbol_kind_to_u8(&kind), encoded);
            assert_eq!(u8_to_symbol_kind(encoded), kind);
        }
    }
3137
    /// Fills an index with three entries whose vectors are the three one-hot
    /// axes, searches with a query closest to the first axis asking for two
    /// results, and expects exactly two hits with "auth" ranked first and
    /// scoring strictly higher than the runner-up.
    #[test]
    fn test_search_top_k() {
        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
        // Override the default dimension to match the 3-component test vectors.
        index.dimension = 3;

        // Add entries with known vectors
        for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
            let mut vec = vec![0.0f32; 3];
            vec[i] = 1.0; // orthogonal vectors
            index.entries.push(EmbeddingEntry {
                chunk: SemanticChunk {
                    file: PathBuf::from("/src/lib.rs"),
                    name: name.to_string(),
                    kind: SymbolKind::Function,
                    start_line: (i * 10 + 1) as u32,
                    end_line: (i * 10 + 5) as u32,
                    exported: true,
                    embed_text: format!("kind:function name:{}", name),
                    snippet: format!("fn {}() {{}}", name),
                },
                vector: vec,
            });
        }

        // Query aligned with "auth" (index 0)
        let query = vec![0.9, 0.1, 0.0];
        let results = index.search(&query, 2);

        assert_eq!(results.len(), 2);
        assert_eq!(results[0].name, "auth"); // highest score
        assert!(results[0].score > results[1].score);
    }
3170
3171    #[test]
3172    fn test_empty_index_search() {
3173        let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3174        let results = index.search(&[0.1, 0.2, 0.3], 10);
3175        assert!(results.is_empty());
3176    }
3177
3178    #[test]
3179    fn single_line_symbol_builds_non_empty_snippet() {
3180        let symbol = Symbol {
3181            name: "answer".to_string(),
3182            kind: SymbolKind::Variable,
3183            range: crate::symbols::Range {
3184                start_line: 0,
3185                start_col: 0,
3186                end_line: 0,
3187                end_col: 24,
3188            },
3189            signature: Some("const answer = 42".to_string()),
3190            scope_chain: Vec::new(),
3191            exported: true,
3192            parent: None,
3193        };
3194        let source = "export const answer = 42;\n";
3195
3196        let snippet = build_snippet(&symbol, source);
3197
3198        assert_eq!(snippet, "export const answer = 42;");
3199    }
3200
3201    #[test]
3202    fn optimized_file_chunk_collection_matches_file_parser_path() {
3203        let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
3204        let file = project_root.join("src/semantic_index.rs");
3205        let source = std::fs::read_to_string(&file).unwrap();
3206
3207        let mut legacy_parser = FileParser::new();
3208        let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
3209        let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
3210
3211        let mut parsers = HashMap::new();
3212        let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
3213
3214        assert_eq!(
3215            chunk_fingerprint(&optimized_chunks),
3216            chunk_fingerprint(&legacy_chunks)
3217        );
3218    }
3219
3220    fn chunk_fingerprint(
3221        chunks: &[SemanticChunk],
3222    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
3223        chunks
3224            .iter()
3225            .map(|chunk| {
3226                (
3227                    chunk.name.clone(),
3228                    chunk.kind.clone(),
3229                    chunk.start_line,
3230                    chunk.end_line,
3231                    chunk.exported,
3232                    chunk.embed_text.clone(),
3233                    chunk.snippet.clone(),
3234                )
3235            })
3236            .collect()
3237    }
3238
3239    #[test]
3240    fn rejects_oversized_dimension_during_deserialization() {
3241        let mut bytes = Vec::new();
3242        bytes.push(1u8);
3243        bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
3244        bytes.extend_from_slice(&0u32.to_le_bytes());
3245        bytes.extend_from_slice(&0u32.to_le_bytes());
3246
3247        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
3248    }
3249
3250    #[test]
3251    fn rejects_oversized_entry_count_during_deserialization() {
3252        let mut bytes = Vec::new();
3253        bytes.push(1u8);
3254        bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
3255        bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
3256        bytes.extend_from_slice(&0u32.to_le_bytes());
3257
3258        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
3259    }
3260
3261    #[test]
3262    fn invalidate_file_removes_entries_and_mtime() {
3263        let target = PathBuf::from("/src/main.rs");
3264        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3265        index.entries.push(EmbeddingEntry {
3266            chunk: SemanticChunk {
3267                file: target.clone(),
3268                name: "main".to_string(),
3269                kind: SymbolKind::Function,
3270                start_line: 0,
3271                end_line: 1,
3272                exported: false,
3273                embed_text: "main".to_string(),
3274                snippet: "fn main() {}".to_string(),
3275            },
3276            vector: vec![1.0; DEFAULT_DIMENSION],
3277        });
3278        index
3279            .file_mtimes
3280            .insert(target.clone(), SystemTime::UNIX_EPOCH);
3281        index.file_sizes.insert(target.clone(), 0);
3282
3283        index.invalidate_file(&target);
3284
3285        assert!(index.entries.is_empty());
3286        assert!(!index.file_mtimes.contains_key(&target));
3287        assert!(!index.file_sizes.contains_key(&target));
3288    }
3289
3290    #[test]
3291    fn refresh_missing_changed_file_is_purged_after_collect() {
3292        let temp = tempfile::tempdir().unwrap();
3293        let project_root = temp.path();
3294        let file = project_root.join("src/lib.rs");
3295        fs::create_dir_all(file.parent().unwrap()).unwrap();
3296        write_rust_file(&file, "vanished_symbol");
3297
3298        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3299        let original_size = *index.file_sizes.get(&file).unwrap();
3300        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
3301        fs::remove_file(&file).unwrap();
3302
3303        let mut embed = test_vector_for_texts;
3304        let mut progress = |_done: usize, _total: usize| {};
3305        let summary = index
3306            .refresh_stale_files(
3307                project_root,
3308                std::slice::from_ref(&file),
3309                &mut embed,
3310                8,
3311                &mut progress,
3312            )
3313            .unwrap();
3314
3315        assert_eq!(summary.changed, 0);
3316        assert_eq!(summary.added, 0);
3317        assert_eq!(summary.deleted, 1);
3318        assert!(index.entries.is_empty());
3319        assert!(!index.file_mtimes.contains_key(&file));
3320        assert!(!index.file_sizes.contains_key(&file));
3321        assert!(!index.file_hashes.contains_key(&file));
3322    }
3323
3324    #[test]
3325    fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
3326        let temp = tempfile::tempdir().unwrap();
3327        let project_root = temp.path();
3328        let file = project_root.join("src/lib.rs");
3329        fs::create_dir_all(file.parent().unwrap()).unwrap();
3330        write_rust_file(&file, "kept_symbol");
3331
3332        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3333        let original_entry_count = index.entries.len();
3334        let original_mtime = *index.file_mtimes.get(&file).unwrap();
3335        let original_size = *index.file_sizes.get(&file).unwrap();
3336
3337        let stale_mtime = SystemTime::UNIX_EPOCH;
3338        set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
3339        fs::remove_file(&file).unwrap();
3340        fs::create_dir(&file).unwrap();
3341
3342        let mut embed = test_vector_for_texts;
3343        let mut progress = |_done: usize, _total: usize| {};
3344        let summary = index
3345            .refresh_stale_files(
3346                project_root,
3347                std::slice::from_ref(&file),
3348                &mut embed,
3349                8,
3350                &mut progress,
3351            )
3352            .unwrap();
3353
3354        assert_eq!(summary.changed, 0);
3355        assert_eq!(summary.added, 0);
3356        assert_eq!(summary.deleted, 0);
3357        assert_eq!(index.entries.len(), original_entry_count);
3358        assert!(index
3359            .entries
3360            .iter()
3361            .any(|entry| entry.chunk.name == "kept_symbol"));
3362        assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
3363        assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
3364        assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
3365    }
3366
3367    #[test]
3368    fn refresh_never_indexed_file_error_does_not_record_mtime() {
3369        let temp = tempfile::tempdir().unwrap();
3370        let project_root = temp.path();
3371        let missing = project_root.join("src/missing.rs");
3372        fs::create_dir_all(missing.parent().unwrap()).unwrap();
3373
3374        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3375        let mut embed = test_vector_for_texts;
3376        let mut progress = |_done: usize, _total: usize| {};
3377        let summary = index
3378            .refresh_stale_files(
3379                project_root,
3380                std::slice::from_ref(&missing),
3381                &mut embed,
3382                8,
3383                &mut progress,
3384            )
3385            .unwrap();
3386
3387        assert_eq!(summary.added, 0);
3388        assert_eq!(summary.changed, 0);
3389        assert_eq!(summary.deleted, 0);
3390        assert!(!index.file_mtimes.contains_key(&missing));
3391        assert!(!index.file_sizes.contains_key(&missing));
3392        assert!(index.entries.is_empty());
3393    }
3394
3395    #[test]
3396    fn refresh_reports_added_for_new_files() {
3397        let temp = tempfile::tempdir().unwrap();
3398        let project_root = temp.path();
3399        let existing = project_root.join("src/lib.rs");
3400        let added = project_root.join("src/new.rs");
3401        fs::create_dir_all(existing.parent().unwrap()).unwrap();
3402        write_rust_file(&existing, "existing_symbol");
3403        write_rust_file(&added, "added_symbol");
3404
3405        let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
3406        let mut embed = test_vector_for_texts;
3407        let mut progress = |_done: usize, _total: usize| {};
3408        let summary = index
3409            .refresh_stale_files(
3410                project_root,
3411                &[existing.clone(), added.clone()],
3412                &mut embed,
3413                8,
3414                &mut progress,
3415            )
3416            .unwrap();
3417
3418        assert_eq!(summary.added, 1);
3419        assert_eq!(summary.changed, 0);
3420        assert_eq!(summary.deleted, 0);
3421        assert_eq!(summary.total_processed, 2);
3422        assert!(index.file_mtimes.contains_key(&added));
3423        assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
3424    }
3425
3426    #[test]
3427    fn refresh_reports_deleted_for_removed_files() {
3428        let temp = tempfile::tempdir().unwrap();
3429        let project_root = temp.path();
3430        let deleted = project_root.join("src/deleted.rs");
3431        fs::create_dir_all(deleted.parent().unwrap()).unwrap();
3432        write_rust_file(&deleted, "deleted_symbol");
3433
3434        let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
3435        fs::remove_file(&deleted).unwrap();
3436
3437        let mut embed = test_vector_for_texts;
3438        let mut progress = |_done: usize, _total: usize| {};
3439        let summary = index
3440            .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
3441            .unwrap();
3442
3443        assert_eq!(summary.deleted, 1);
3444        assert_eq!(summary.changed, 0);
3445        assert_eq!(summary.added, 0);
3446        assert_eq!(summary.total_processed, 1);
3447        assert!(!index.file_mtimes.contains_key(&deleted));
3448        assert!(index.entries.is_empty());
3449    }
3450
3451    #[test]
3452    fn refresh_reports_changed_for_modified_files() {
3453        let temp = tempfile::tempdir().unwrap();
3454        let project_root = temp.path();
3455        let file = project_root.join("src/lib.rs");
3456        fs::create_dir_all(file.parent().unwrap()).unwrap();
3457        write_rust_file(&file, "old_symbol");
3458
3459        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3460        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
3461        write_rust_file(&file, "new_symbol");
3462
3463        let mut embed = test_vector_for_texts;
3464        let mut progress = |_done: usize, _total: usize| {};
3465        let summary = index
3466            .refresh_stale_files(
3467                project_root,
3468                std::slice::from_ref(&file),
3469                &mut embed,
3470                8,
3471                &mut progress,
3472            )
3473            .unwrap();
3474
3475        assert_eq!(summary.changed, 1);
3476        assert_eq!(summary.added, 0);
3477        assert_eq!(summary.deleted, 0);
3478        assert_eq!(summary.total_processed, 1);
3479        assert!(index
3480            .entries
3481            .iter()
3482            .any(|entry| entry.chunk.name == "new_symbol"));
3483        assert!(!index
3484            .entries
3485            .iter()
3486            .any(|entry| entry.chunk.name == "old_symbol"));
3487    }
3488
3489    #[test]
3490    fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
3491        let temp = tempfile::tempdir().unwrap();
3492        let project_root = temp.path();
3493        let file = project_root.join("src/lib.rs");
3494        fs::create_dir_all(file.parent().unwrap()).unwrap();
3495        write_rust_file(&file, "clean_symbol");
3496
3497        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3498        let original_entries = index.entries.len();
3499        let mut embed_called = false;
3500        let mut embed = |texts: Vec<String>| {
3501            embed_called = true;
3502            test_vector_for_texts(texts)
3503        };
3504        let mut progress = |_done: usize, _total: usize| {};
3505        let summary = index
3506            .refresh_stale_files(
3507                project_root,
3508                std::slice::from_ref(&file),
3509                &mut embed,
3510                8,
3511                &mut progress,
3512            )
3513            .unwrap();
3514
3515        assert!(summary.is_noop());
3516        assert_eq!(summary.total_processed, 1);
3517        assert!(!embed_called);
3518        assert_eq!(index.entries.len(), original_entries);
3519    }
3520
3521    #[test]
3522    fn detects_missing_onnx_runtime_from_dynamic_load_error() {
3523        let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
3524
3525        assert!(is_onnx_runtime_unavailable(message));
3526    }
3527
3528    #[test]
3529    fn formats_missing_onnx_runtime_with_install_hint() {
3530        let message = format_embedding_init_error(
3531            "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
3532        );
3533
3534        assert!(message.starts_with("ONNX Runtime not found. Install via:"));
3535        assert!(message.contains("Original error:"));
3536    }
3537
3538    #[test]
3539    fn openai_compatible_backend_embeds_with_mock_server() {
3540        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3541            assert!(request_line.starts_with("POST "));
3542            assert_eq!(path, "/v1/embeddings");
3543            "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
3544        });
3545
3546        let config = SemanticBackendConfig {
3547            backend: SemanticBackend::OpenAiCompatible,
3548            model: "test-embedding".to_string(),
3549            base_url: Some(base_url),
3550            api_key_env: None,
3551            timeout_ms: 5_000,
3552            max_batch_size: 64,
3553            max_files: 20_000,
3554        };
3555
3556        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3557        let vectors = model
3558            .embed(vec!["hello".to_string(), "world".to_string()])
3559            .unwrap();
3560
3561        assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
3562        handle.join().unwrap();
3563    }
3564
3565    /// Regression for issue #36: AFT was sending TWO Content-Type headers
3566    /// on the OpenAI embeddings request — once implicitly via `.json(&body)`
3567    /// and again explicitly via `.header("Content-Type", "application/json")`.
3568    /// reqwest's `.header()` calls `HeaderMap::append`, which produces two
3569    /// headers on the wire. OpenAI's /v1/embeddings endpoint rejects that
3570    /// with `HTTP 400 "you must provide a model parameter"` even though the
3571    /// body actually contains `model`. The fix is to drop the explicit
3572    /// `.header("Content-Type", ...)` call. This test pins that we send
3573    /// exactly one Content-Type header.
3574    #[test]
3575    fn openai_compatible_request_has_single_content_type_header() {
3576        use std::sync::{Arc, Mutex};
3577        let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
3578        let captured_for_thread = Arc::clone(&captured);
3579
3580        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3581        let addr = listener.local_addr().expect("local addr");
3582        let handle = thread::spawn(move || {
3583            let (mut stream, _) = listener.accept().expect("accept");
3584            let mut buf = Vec::new();
3585            let mut chunk = [0u8; 4096];
3586            let mut header_end = None;
3587            let mut content_length = 0usize;
3588            loop {
3589                let n = stream.read(&mut chunk).expect("read");
3590                if n == 0 {
3591                    break;
3592                }
3593                buf.extend_from_slice(&chunk[..n]);
3594                if header_end.is_none() {
3595                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3596                        header_end = Some(pos + 4);
3597                        for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
3598                            if let Some(value) = line.strip_prefix("Content-Length:") {
3599                                content_length = value.trim().parse::<usize>().unwrap_or(0);
3600                            }
3601                        }
3602                    }
3603                }
3604                if let Some(end) = header_end {
3605                    if buf.len() >= end + content_length {
3606                        break;
3607                    }
3608                }
3609            }
3610            *captured_for_thread.lock().unwrap() = buf;
3611            let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
3612            let response = format!(
3613                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3614                body.len(),
3615                body
3616            );
3617            let _ = stream.write_all(response.as_bytes());
3618        });
3619
3620        let config = SemanticBackendConfig {
3621            backend: SemanticBackend::OpenAiCompatible,
3622            model: "text-embedding-3-small".to_string(),
3623            base_url: Some(format!("http://{}", addr)),
3624            api_key_env: None,
3625            timeout_ms: 5_000,
3626            max_batch_size: 64,
3627            max_files: 20_000,
3628        };
3629        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3630        let _ = model.embed(vec!["probe".to_string()]).unwrap();
3631        handle.join().unwrap();
3632
3633        let bytes = captured.lock().unwrap().clone();
3634        let request = String::from_utf8_lossy(&bytes);
3635
3636        // Lowercase line counts because HTTP headers are case-insensitive
3637        // and reqwest may emit `content-type` in lowercase under HTTP/2.
3638        let content_type_lines = request
3639            .lines()
3640            .filter(|line| {
3641                let lower = line.to_ascii_lowercase();
3642                lower.starts_with("content-type:")
3643            })
3644            .count();
3645        assert_eq!(
3646            content_type_lines, 1,
3647            "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
3648        );
3649
3650        // The body must still include the model field — pin this so a future
3651        // change can't accidentally drop `model` while fixing duplicate headers.
3652        assert!(
3653            request.contains(r#""model":"text-embedding-3-small""#),
3654            "request body should contain model field; full request:\n{request}",
3655        );
3656    }
3657
3658    #[test]
3659    fn ollama_backend_embeds_with_mock_server() {
3660        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3661            assert!(request_line.starts_with("POST "));
3662            assert_eq!(path, "/api/embed");
3663            "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
3664        });
3665
3666        let config = SemanticBackendConfig {
3667            backend: SemanticBackend::Ollama,
3668            model: "embeddinggemma".to_string(),
3669            base_url: Some(base_url),
3670            api_key_env: None,
3671            timeout_ms: 5_000,
3672            max_batch_size: 64,
3673            max_files: 20_000,
3674        };
3675
3676        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3677        let vectors = model
3678            .embed(vec!["hello".to_string(), "world".to_string()])
3679            .unwrap();
3680
3681        assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
3682        handle.join().unwrap();
3683    }
3684
3685    #[test]
3686    fn read_from_disk_rejects_fingerprint_mismatch() {
3687        let storage = tempfile::tempdir().unwrap();
3688        let project_key = "proj";
3689
3690        let project_root = test_project_root();
3691        let file = project_root.join("src/main.rs");
3692        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
3693        index.entries.push(EmbeddingEntry {
3694            chunk: SemanticChunk {
3695                file: file.clone(),
3696                name: "handle_request".to_string(),
3697                kind: SymbolKind::Function,
3698                start_line: 10,
3699                end_line: 25,
3700                exported: true,
3701                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3702                snippet: "fn handle_request() {}".to_string(),
3703            },
3704            vector: vec![0.1, 0.2, 0.3],
3705        });
3706        index.dimension = 3;
3707        index
3708            .file_mtimes
3709            .insert(file.clone(), SystemTime::UNIX_EPOCH);
3710        index.file_sizes.insert(file, 0);
3711        index.set_fingerprint(SemanticIndexFingerprint {
3712            backend: "openai_compatible".to_string(),
3713            model: "test-embedding".to_string(),
3714            base_url: "http://127.0.0.1:1234/v1".to_string(),
3715            dimension: 3,
3716            chunking_version: default_chunking_version(),
3717        });
3718        index.write_to_disk(storage.path(), project_key);
3719
3720        let matching = index.fingerprint().unwrap().as_string();
3721        assert!(SemanticIndex::read_from_disk(
3722            storage.path(),
3723            project_key,
3724            &project_root,
3725            false,
3726            Some(&matching),
3727        )
3728        .is_some());
3729
3730        let mismatched = SemanticIndexFingerprint {
3731            backend: "ollama".to_string(),
3732            model: "embeddinggemma".to_string(),
3733            base_url: "http://127.0.0.1:11434".to_string(),
3734            dimension: 3,
3735            chunking_version: default_chunking_version(),
3736        }
3737        .as_string();
3738        assert!(SemanticIndex::read_from_disk(
3739            storage.path(),
3740            project_key,
3741            &project_root,
3742            false,
3743            Some(&mismatched),
3744        )
3745        .is_none());
3746    }
3747
3748    #[test]
3749    fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
3750        let storage = tempfile::tempdir().unwrap();
3751        let project_key = "proj-v3";
3752        let dir = storage.path().join("semantic").join(project_key);
3753        fs::create_dir_all(&dir).unwrap();
3754
3755        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3756        index.entries.push(EmbeddingEntry {
3757            chunk: SemanticChunk {
3758                file: PathBuf::from("/src/main.rs"),
3759                name: "handle_request".to_string(),
3760                kind: SymbolKind::Function,
3761                start_line: 0,
3762                end_line: 0,
3763                exported: true,
3764                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3765                snippet: "fn handle_request() {}".to_string(),
3766            },
3767            vector: vec![0.1, 0.2, 0.3],
3768        });
3769        index.dimension = 3;
3770        index
3771            .file_mtimes
3772            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
3773        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
3774        let fingerprint = SemanticIndexFingerprint {
3775            backend: "fastembed".to_string(),
3776            model: "test".to_string(),
3777            base_url: FALLBACK_BACKEND.to_string(),
3778            dimension: 3,
3779            chunking_version: default_chunking_version(),
3780        };
3781        index.set_fingerprint(fingerprint.clone());
3782
3783        let mut bytes = index.to_bytes();
3784        bytes[0] = SEMANTIC_INDEX_VERSION_V3;
3785        fs::write(dir.join("semantic.bin"), bytes).unwrap();
3786
3787        assert!(SemanticIndex::read_from_disk(
3788            storage.path(),
3789            project_key,
3790            &test_project_root(),
3791            false,
3792            Some(&fingerprint.as_string())
3793        )
3794        .is_none());
3795        assert!(!dir.join("semantic.bin").exists());
3796    }
3797
3798    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3799        crate::symbols::Symbol {
3800            name: name.to_string(),
3801            kind,
3802            range: crate::symbols::Range {
3803                start_line: start,
3804                start_col: 0,
3805                end_line: end,
3806                end_col: 0,
3807            },
3808            signature: None,
3809            scope_chain: Vec::new(),
3810            exported: false,
3811            parent: None,
3812        }
3813    }
3814
3815    /// Heading symbols (Markdown / HTML headings) must NOT be indexed —
3816    /// they overwhelmingly dominated semantic results even on code-shaped
3817    /// queries because heading prose embeds far more strongly than code
3818    /// chunks. Skipping headings keeps aft_search a code-finder.
3819    #[test]
3820    fn symbols_to_chunks_skips_heading_symbols() {
3821        let project_root = PathBuf::from("/proj");
3822        let file = project_root.join("README.md");
3823        let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
3824
3825        let symbols = vec![
3826            make_symbol(SymbolKind::Heading, "Title", 0, 2),
3827            make_symbol(SymbolKind::Heading, "Section", 4, 6),
3828        ];
3829
3830        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3831        assert!(
3832            chunks.is_empty(),
3833            "Heading symbols must be filtered out before embedding; got {} chunk(s)",
3834            chunks.len()
3835        );
3836    }
3837
3838    /// Code symbols (functions, classes, methods, structs, etc.) must still
3839    /// be indexed alongside the heading skip — otherwise we'd starve the
3840    /// index entirely.
3841    #[test]
3842    fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
3843        let project_root = PathBuf::from("/proj");
3844        let file = project_root.join("src/lib.rs");
3845        let source = "pub fn handle_request() -> bool {\n    true\n}\n";
3846
3847        let symbols = vec![
3848            // A heading mixed in (e.g. from a doc comment block elsewhere).
3849            make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
3850            make_symbol(SymbolKind::Function, "handle_request", 0, 2),
3851            make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
3852        ];
3853
3854        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3855        assert_eq!(
3856            chunks.len(),
3857            3,
3858            "Expected file-summary + 2 code chunks (Function + Struct), got {}",
3859            chunks.len()
3860        );
3861        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
3862        assert!(chunks
3863            .iter()
3864            .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
3865        assert!(names.contains(&"handle_request"));
3866        assert!(names.contains(&"AuthService"));
3867        assert!(
3868            !names.contains(&"doc heading"),
3869            "Heading symbol leaked into chunks: {names:?}"
3870        );
3871    }
3872
3873    #[test]
3874    fn validate_ssrf_allows_loopback_hostnames() {
3875        // Loopback hostnames are explicitly allowed so self-hosted backends
3876        // (Ollama at http://localhost:11434) work at their default config.
3877        for host in &[
3878            "http://localhost",
3879            "http://localhost:8080",
3880            "http://localhost:11434", // Ollama default
3881            "http://localhost.localdomain",
3882            "http://foo.localhost",
3883        ] {
3884            assert!(
3885                validate_base_url_no_ssrf(host).is_ok(),
3886                "Expected {host} to be allowed (loopback), got: {:?}",
3887                validate_base_url_no_ssrf(host)
3888            );
3889        }
3890    }
3891
3892    #[test]
3893    fn validate_ssrf_allows_loopback_ips() {
3894        // 127.0.0.0/8 is loopback — by definition same-machine and not an
3895        // SSRF target. Allow it so Ollama at http://127.0.0.1:11434 works.
3896        for url in &[
3897            "http://127.0.0.1",
3898            "http://127.0.0.1:11434", // Ollama default
3899            "http://127.0.0.1:8080",
3900            "http://127.1.2.3",
3901        ] {
3902            let result = validate_base_url_no_ssrf(url);
3903            assert!(
3904                result.is_ok(),
3905                "Expected {url} to be allowed (loopback), got: {:?}",
3906                result
3907            );
3908        }
3909    }
3910
3911    #[test]
3912    fn validate_ssrf_rejects_private_non_loopback_ips() {
3913        // Non-loopback private/reserved IPs remain rejected — homelab/intranet
3914        // services on LAN IPs are real SSRF targets even though the user
3915        // configured them. Users who want this can opt in by binding the
3916        // service to a public-routable address.
3917        for url in &[
3918            "http://192.168.1.1",
3919            "http://10.0.0.1",
3920            "http://172.16.0.1",
3921            "http://169.254.169.254",
3922            "http://100.64.0.1",
3923        ] {
3924            let result = validate_base_url_no_ssrf(url);
3925            assert!(
3926                result.is_err(),
3927                "Expected {url} to be rejected (non-loopback private), got: {:?}",
3928                result
3929            );
3930        }
3931    }
3932
3933    #[test]
3934    fn validate_ssrf_rejects_mdns_local_hostnames() {
3935        // mDNS .local hostnames typically resolve to LAN devices, not
3936        // loopback. Rejecting them before DNS lookup gives a clearer error.
3937        for host in &[
3938            "http://printer.local",
3939            "http://nas.local:8080",
3940            "http://homelab.local",
3941        ] {
3942            let result = validate_base_url_no_ssrf(host);
3943            assert!(
3944                result.is_err(),
3945                "Expected {host} to be rejected (mDNS), got: {:?}",
3946                result
3947            );
3948        }
3949    }
3950
3951    #[test]
3952    fn normalize_base_url_allows_localhost_for_tests() {
3953        // normalize_base_url itself should NOT block localhost — only
3954        // validate_base_url_no_ssrf does. Tests construct backends directly.
3955        assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
3956        assert!(normalize_base_url("http://localhost:8080").is_ok());
3957    }
3958
3959    /// Pin the user-facing wording of the ONNX version-mismatch error.
3960    /// The auto-fix path MUST be listed first because it's the only safe
3961    /// option that doesn't require sudo or risk breaking other apps that
3962    /// link the system library. Regression of any of these strings would
3963    /// either mislead users (system rm before auto-fix) or break the
3964    /// `aft doctor --fix` discovery path.
3965    #[test]
3966    fn ort_mismatch_message_recommends_auto_fix_first() {
3967        let msg =
3968            format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
3969
3970        // The reported version and path must appear verbatim.
3971        assert!(
3972            msg.contains("v1.9.0"),
3973            "should report detected version: {msg}"
3974        );
3975        assert!(
3976            msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
3977            "should report system path: {msg}"
3978        );
3979        assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
3980
3981        // Solution ordering: auto-fix is #1, system rm is #2, install is #3.
3982        let auto_fix_pos = msg
3983            .find("Auto-fix")
3984            .expect("Auto-fix solution missing — users won't discover --fix");
3985        let remove_pos = msg
3986            .find("Remove the old library")
3987            .expect("system-rm solution missing");
3988        assert!(
3989            auto_fix_pos < remove_pos,
3990            "Auto-fix must come before manual rm — see PR comment thread"
3991        );
3992
3993        // The auto-fix command must be runnable as-is on a fresh system.
3994        assert!(
3995            msg.contains("npx @cortexkit/aft doctor --fix"),
3996            "auto-fix command must be present and copy-pasteable: {msg}"
3997        );
3998    }
3999
4000    /// macOS dylib paths must not produce a malformed message when the
4001    /// system path lacks a trailing slash. This is a regression guard
4002    /// for the "{}\n{}" format string contract.
4003    #[test]
4004    fn ort_mismatch_message_handles_macos_dylib_path() {
4005        let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
4006        assert!(msg.contains("v1.9.0"));
4007        assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
4008        // The dylib path must appear in the auto-fix paragraph (single
4009        // quotes around it) AND in the manual-rm paragraph; verify
4010        // both placements survived the format string.
4011        assert!(
4012            msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
4013            "system path should be quoted in the auto-fix sentence: {msg}"
4014        );
4015    }
4016}
aft/semantic_index.rs

aft/
semantic_index.rs