Skip to main content

aft/
semantic_index.rs

1use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
2use crate::config::{SemanticBackend, SemanticBackendConfig};
3use crate::fs_lock;
4use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
5use crate::search_index::{cache_relative_path, cached_path_under_root};
6use crate::symbols::{Symbol, SymbolKind};
7use crate::{slog_info, slog_warn};
8
9use crate::local_embed::LocalEmbedder;
10use rayon::prelude::*;
11use reqwest::blocking::Client;
12use serde::{Deserialize, Serialize};
13use std::collections::{HashMap, HashSet, VecDeque};
14use std::env;
15use std::fmt::Display;
16use std::fs;
17use std::path::{Path, PathBuf};
18use std::sync::Mutex;
19use std::time::Duration;
20use std::time::SystemTime;
21use tree_sitter::Parser;
22use url::Url;
23
/// Fallback embedding width.
/// NOTE(review): 384 presumably matches the local all-MiniLM-L6-v2 model — confirm.
const DEFAULT_DIMENSION: usize = 384;
/// Cap on the number of index entries.
/// NOTE(review): the enforcement site is outside the reviewed section — confirm.
const MAX_ENTRIES: usize = 1_000_000;
/// Largest embedding width accepted by `validate_embedding_dimension`.
///
/// Wide enough for high-dimensional backends such as OpenAI
/// text-embedding-3-large (3072) and common local models (4096), while still
/// keeping the supported shape bounded.
const MAX_DIMENSION: usize = 4096;
/// Width in bytes of one stored `f32` component.
const F32_BYTES: usize = std::mem::size_of::<f32>();
/// Header size in bytes for the V1 on-disk format.
/// NOTE(review): field layout is defined by the reader/writer elsewhere — confirm.
const HEADER_BYTES_V1: usize = 9;
/// Header size in bytes for the V2 on-disk format (same caveat as V1).
const HEADER_BYTES_V2: usize = 13;
/// User-facing hint for the case where ONNX Runtime cannot be located.
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS), \
     apt install libonnxruntime (Linux), or place onnxruntime.dll in your PATH (Windows). \
     AFT can auto-download ONNX Runtime — run `npx @cortexkit/aft doctor` to diagnose.";

/// First on-disk format revision.
const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
/// Second on-disk format revision.
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
/// V3 stores `subsec_nanos` in the file-mtime table. V1/V2 persisted only
/// whole-second mtimes, so on filesystems with subsecond precision (APFS,
/// ext4 with nsec, NTFS) every restart flagged ~99% of files as stale and
/// re-embedded them; V3 makes staleness detection survive that round-trip.
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
/// V4 has the same on-disk layout as V3. The bump forces a one-time rebuild
/// of persisted snippets after symbol ranges stopped being treated
/// (incorrectly) as 1-based.
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
/// V5 records file sizes in the file metadata table, letting incremental
/// staleness detection notice content changes that mtime precision misses.
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
/// V6 stores paths relative to `project_root` and adds content hashes.
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
/// Path appended to an OpenAI-compatible base URL to reach the embeddings API.
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
/// Path appended to an Ollama base URL to reach the embeddings API.
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
/// HTTP client timeout used when the config leaves `timeout_ms` at 0.
/// Must stay below the bridge timeout (30s) to avoid bridge kills on slow backends.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
/// Batch size used when the config leaves `max_batch_size` at 0.
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
/// Maximum number of query embeddings kept in the per-model cache.
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
/// Placeholder recorded in a fingerprint when no usable `base_url` exists.
const FALLBACK_BACKEND: &str = "none";
/// Total number of tries (first attempt included) for one embedding request.
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
/// Delay in milliseconds before retry N, indexed by the failed attempt.
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];
/// Process-wide mutex held while `SemanticIndexLock::acquire` takes the file lock.
static SEMANTIC_LOCK_ACQUIRE_MUTEX: Mutex<()> = Mutex::new(());
62
63pub struct SemanticIndexLock {
64    _guard: fs_lock::LockGuard,
65}
66
67impl SemanticIndexLock {
68    pub fn acquire(storage_dir: &Path, project_key: &str) -> std::io::Result<Self> {
69        let dir = storage_dir.join("semantic").join(project_key);
70        fs::create_dir_all(&dir)?;
71        let path = dir.join("cache.lock");
72        let _acquire_guard = SEMANTIC_LOCK_ACQUIRE_MUTEX
73            .lock()
74            .map_err(|_| std::io::Error::other("semantic cache lock acquisition mutex poisoned"))?;
75        fs_lock::try_acquire(&path, Duration::from_secs(2))
76            .map(|guard| Self { _guard: guard })
77            .map_err(|error| match error {
78                fs_lock::AcquireError::Timeout => {
79                    std::io::Error::other("timed out acquiring semantic cache lock")
80                }
81                fs_lock::AcquireError::Io(error) => error,
82            })
83    }
84}
85
86#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct SemanticIndexFingerprint {
88    pub backend: String,
89    pub model: String,
90    #[serde(default)]
91    pub base_url: String,
92    pub dimension: usize,
93    #[serde(default = "default_chunking_version")]
94    pub chunking_version: u32,
95}
96
97fn default_chunking_version() -> u32 {
98    2
99}
100
101impl SemanticIndexFingerprint {
102    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
103        // Use normalized URL for fingerprinting so cosmetic differences
104        // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds.
105        let base_url = config
106            .base_url
107            .as_ref()
108            .and_then(|u| normalize_base_url(u).ok())
109            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
110        Self {
111            backend: config.backend.as_str().to_string(),
112            model: config.model.clone(),
113            base_url,
114            dimension,
115            chunking_version: default_chunking_version(),
116        }
117    }
118
119    pub fn as_string(&self) -> String {
120        serde_json::to_string(self).unwrap_or_else(|_| String::new())
121    }
122
123    fn matches_expected(&self, expected: &str) -> bool {
124        let encoded = self.as_string();
125        !encoded.is_empty() && encoded == expected
126    }
127}
128
129enum SemanticEmbeddingEngine {
130    /// Local ONNX embedder (all-MiniLM-L6-v2 via raw `ort`). The config-facing
131    /// backend string stays "fastembed" for index-fingerprint compatibility.
132    Local(LocalEmbedder),
133    OpenAiCompatible {
134        client: Client,
135        model: String,
136        base_url: String,
137        api_key: Option<String>,
138    },
139    Ollama {
140        client: Client,
141        model: String,
142        base_url: String,
143    },
144}
145
146pub struct SemanticEmbeddingModel {
147    backend: SemanticBackend,
148    model: String,
149    base_url: Option<String>,
150    timeout_ms: u64,
151    max_batch_size: usize,
152    dimension: Option<usize>,
153    engine: SemanticEmbeddingEngine,
154    query_embedding_cache: HashMap<String, Vec<f32>>,
155    query_embedding_cache_order: VecDeque<String>,
156    query_embedding_cache_hits: u64,
157    query_embedding_cache_misses: u64,
158}
159
160pub type EmbeddingModel = SemanticEmbeddingModel;
161
162fn validate_embedding_batch(
163    vectors: &[Vec<f32>],
164    expected_count: usize,
165    context: &str,
166) -> Result<(), String> {
167    if expected_count > 0 && vectors.is_empty() {
168        return Err(format!(
169            "{context} returned no vectors for {expected_count} inputs"
170        ));
171    }
172
173    if vectors.len() != expected_count {
174        return Err(format!(
175            "{context} returned {} vectors for {} inputs",
176            vectors.len(),
177            expected_count
178        ));
179    }
180
181    let Some(first_vector) = vectors.first() else {
182        return Ok(());
183    };
184    let expected_dimension = first_vector.len();
185    validate_embedding_dimension(expected_dimension)
186        .map_err(|error| format!("{context} returned {error}"))?;
187    for (index, vector) in vectors.iter().enumerate() {
188        if vector.len() != expected_dimension {
189            return Err(format!(
190                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
191                vector.len()
192            ));
193        }
194    }
195
196    Ok(())
197}
198
199fn validate_embedding_dimension(dimension: usize) -> Result<(), String> {
200    if dimension == 0 || dimension > MAX_DIMENSION {
201        return Err(format!(
202            "invalid embedding dimension: {dimension}; supported range is 1..={MAX_DIMENSION}"
203        ));
204    }
205
206    Ok(())
207}
208
209/// Normalize a base URL: validate scheme and strip trailing slash.
210/// Does NOT perform SSRF/private-IP validation — call
211/// `validate_base_url_no_ssrf` separately when processing user-supplied config.
212fn normalize_base_url(raw: &str) -> Result<String, String> {
213    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
214    let scheme = parsed.scheme();
215    if scheme != "http" && scheme != "https" {
216        return Err(format!(
217            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
218            scheme
219        ));
220    }
221    Ok(parsed.to_string().trim_end_matches('/').to_string())
222}
223
224/// Validate that a base URL does not point to a private/loopback address.
225/// Call this on user-supplied config (at configure time) to prevent SSRF.
226/// Not called for programmatically constructed configs (e.g. tests).
227///
228/// **Loopback is allowed.** Self-hosted embedding backends (e.g. Ollama at
229/// `http://127.0.0.1:11434`) are a primary use case for `aft_search`. Loopback
230/// addresses by definition cannot be exploited as SSRF targets — they only
231/// reach services on the same machine. Allowing loopback unblocks Ollama at its
232/// default config without opening up SSRF to LAN/intranet services, which
233/// remain rejected.
234///
235/// **mDNS `.local` is rejected.** mDNS hostnames typically resolve to LAN
236/// devices (printers, homelab servers); rejecting them before DNS lookup keeps
237/// the SSRF guard meaningful for non-loopback private networks.
238pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
239    use std::net::{IpAddr, ToSocketAddrs};
240
241    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
242
243    let host = parsed.host_str().unwrap_or("");
244
245    // Loopback hostnames are explicitly allowed. RFC 6761 mandates that
246    // `localhost` and `*.localhost` resolve to loopback;
247    // `localhost.localdomain` is a historical alias used on some Linux
248    // distros. Self-hosted backends like Ollama use these by default.
249    let is_loopback_host =
250        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
251    if is_loopback_host {
252        return Ok(());
253    }
254
255    // mDNS hostnames are typically LAN devices, not loopback. Reject before
256    // DNS lookup so users get a clear error rather than a private-IP error.
257    if host.ends_with(".local") {
258        return Err(format!(
259            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
260        ));
261    }
262
263    // Resolve the hostname. Reject private/link-local/CGNAT IPs but NOT
264    // loopback (which is by definition same-machine and not an SSRF target).
265    let port = parsed.port_or_known_default().unwrap_or(443);
266    let addr_str = format!("{host}:{port}");
267    let addrs: Vec<IpAddr> = addr_str
268        .to_socket_addrs()
269        .map(|iter| iter.map(|sa| sa.ip()).collect())
270        .unwrap_or_default();
271    for ip in &addrs {
272        if is_private_non_loopback_ip(ip) {
273            return Err(format!(
274                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
275            ));
276        }
277    }
278
279    Ok(())
280}
281
/// Returns true for IPv4/IPv6 addresses in private/link-local/CGNAT/wildcard
/// ranges, EXCLUDING loopback (127.0.0.0/8 and ::1). Loopback is considered
/// safe for SSRF purposes — see [`validate_base_url_no_ssrf`] for rationale.
///
/// Ranges treated as private:
/// - IPv4: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, 169.254.0.0/16
///   (link-local), 100.64.0.0/10 (CGNAT), 0.0.0.0/8 (wildcard).
/// - IPv6: `::` (unspecified, the counterpart of 0.0.0.0), fe80::/10
///   (link-local), fc00::/7 (unique-local), and IPv4-mapped addresses
///   (`::ffff:a.b.c.d`) whose embedded IPv4 address is private by the rules
///   above.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(v4) => {
            let [first, second, ..] = v4.octets();
            // Note: 127.0.0.0/8 (loopback) is intentionally NOT in this set.
            // 10.0.0.0/8
            first == 10
                // 172.16.0.0/12
                || (first == 172 && (16..=31).contains(&second))
                // 192.168.0.0/16
                || (first == 192 && second == 168)
                // 169.254.0.0/16 link-local
                || (first == 169 && second == 254)
                // 100.64.0.0/10 CGNAT
                || (first == 100 && (64..=127).contains(&second))
                // 0.0.0.0/8 wildcard
                || first == 0
        }
        IpAddr::V6(v6) => {
            let leading = v6.segments()[0];
            // Note: ::1 (loopback) is intentionally NOT in this set.
            // :: unspecified — rejected for parity with 0.0.0.0/8 above
            v6.is_unspecified()
                // fe80::/10 link-local
                || (leading & 0xffc0) == 0xfe80
                // fc00::/7 unique-local
                || (leading & 0xfe00) == 0xfc00
                // ::ffff:0:0/96 IPv4-mapped — judge the embedded IPv4 address
                || v6
                    .to_ipv4_mapped()
                    .is_some_and(|mapped| is_private_non_loopback_ip(&IpAddr::V4(mapped)))
        }
    }
}
323
324fn build_openai_embeddings_endpoint(base_url: &str) -> String {
325    if base_url.ends_with("/v1") {
326        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
327    } else {
328        format!("{base_url}/v1{}", DEFAULT_OPENAI_EMBEDDING_PATH)
329    }
330}
331
332fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
333    if base_url.ends_with("/api") {
334        format!("{base_url}/embed")
335    } else {
336        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
337    }
338}
339
/// Trim surrounding whitespace from an optional string and collapse an
/// empty / whitespace-only value to `None`.
fn normalize_api_key(value: Option<String>) -> Option<String> {
    let trimmed = value.as_deref().map(str::trim)?;
    (!trimmed.is_empty()).then(|| trimmed.to_string())
}
350
351fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
352    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
353}
354
355fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
356    error.is_connect()
357}
358
359fn sleep_before_embedding_retry(attempt_index: usize) {
360    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
361        std::thread::sleep(Duration::from_millis(*delay_ms));
362    }
363}
364
365fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
366where
367    F: FnMut() -> reqwest::blocking::RequestBuilder,
368{
369    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
370        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;
371
372        let response = match make_request().send() {
373            Ok(response) => response,
374            Err(error) => {
375                if !last_attempt && is_retryable_embedding_error(&error) {
376                    sleep_before_embedding_retry(attempt_index);
377                    continue;
378                }
379                return Err(format!("{backend_label} request failed: {error}"));
380            }
381        };
382
383        let status = response.status();
384        let raw = match response.text() {
385            Ok(raw) => raw,
386            Err(error) => {
387                if !last_attempt && is_retryable_embedding_error(&error) {
388                    sleep_before_embedding_retry(attempt_index);
389                    continue;
390                }
391                return Err(format!("{backend_label} response read failed: {error}"));
392            }
393        };
394
395        if status.is_success() {
396            return Ok(raw);
397        }
398
399        if !last_attempt && is_retryable_embedding_status(status) {
400            sleep_before_embedding_retry(attempt_index);
401            continue;
402        }
403
404        return Err(format!(
405            "{backend_label} request failed (HTTP {}): {}",
406            status, raw
407        ));
408    }
409
410    unreachable!("embedding request retries exhausted without returning")
411}
412
413impl SemanticEmbeddingModel {
414    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
415        let timeout_ms = if config.timeout_ms == 0 {
416            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
417        } else {
418            config.timeout_ms
419        };
420
421        let max_batch_size = if config.max_batch_size == 0 {
422            DEFAULT_MAX_BATCH_SIZE
423        } else {
424            config.max_batch_size
425        };
426
427        let api_key_env = normalize_api_key(config.api_key_env.clone());
428        let model = config.model.clone();
429
430        let client = Client::builder()
431            .timeout(Duration::from_millis(timeout_ms))
432            .redirect(reqwest::redirect::Policy::none())
433            .build()
434            .map_err(|error| format!("failed to configure embedding client: {error}"))?;
435
436        let engine = match config.backend {
437            SemanticBackend::Fastembed => {
438                SemanticEmbeddingEngine::Local(LocalEmbedder::new(&model)?)
439            }
440            SemanticBackend::OpenAiCompatible => {
441                let raw = config.base_url.as_ref().ok_or_else(|| {
442                    "base_url is required for openai_compatible backend".to_string()
443                })?;
444                let base_url = normalize_base_url(raw)?;
445
446                let api_key = match api_key_env {
447                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
448                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
449                    })?),
450                    None => None,
451                };
452
453                SemanticEmbeddingEngine::OpenAiCompatible {
454                    client,
455                    model,
456                    base_url,
457                    api_key,
458                }
459            }
460            SemanticBackend::Ollama => {
461                let raw = config
462                    .base_url
463                    .as_ref()
464                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
465                let base_url = normalize_base_url(raw)?;
466
467                SemanticEmbeddingEngine::Ollama {
468                    client,
469                    model,
470                    base_url,
471                }
472            }
473        };
474
475        Ok(Self {
476            backend: config.backend,
477            model: config.model.clone(),
478            base_url: config.base_url.clone(),
479            timeout_ms,
480            max_batch_size,
481            dimension: None,
482            engine,
483            query_embedding_cache: HashMap::new(),
484            query_embedding_cache_order: VecDeque::new(),
485            query_embedding_cache_hits: 0,
486            query_embedding_cache_misses: 0,
487        })
488    }
489
490    pub fn backend(&self) -> SemanticBackend {
491        self.backend
492    }
493
494    pub fn model(&self) -> &str {
495        &self.model
496    }
497
498    pub fn base_url(&self) -> Option<&str> {
499        self.base_url.as_deref()
500    }
501
502    pub fn max_batch_size(&self) -> usize {
503        self.max_batch_size
504    }
505
506    pub fn timeout_ms(&self) -> u64 {
507        self.timeout_ms
508    }
509
510    pub fn fingerprint(
511        &mut self,
512        config: &SemanticBackendConfig,
513    ) -> Result<SemanticIndexFingerprint, String> {
514        let dimension = self.dimension()?;
515        Ok(SemanticIndexFingerprint::from_config(config, dimension))
516    }
517
518    pub fn dimension(&mut self) -> Result<usize, String> {
519        if let Some(dimension) = self.dimension {
520            return Ok(dimension);
521        }
522
523        let dimension = match &mut self.engine {
524            SemanticEmbeddingEngine::Local(model) => {
525                let vectors = model.embed(&["semantic index fingerprint probe".to_string()])?;
526                vectors
527                    .first()
528                    .map(|v| v.len())
529                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
530            }
531            SemanticEmbeddingEngine::OpenAiCompatible { .. } => {
532                let vectors =
533                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
534                vectors
535                    .first()
536                    .map(|v| v.len())
537                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
538            }
539            SemanticEmbeddingEngine::Ollama { .. } => {
540                let vectors =
541                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
542                vectors
543                    .first()
544                    .map(|v| v.len())
545                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
546            }
547        };
548
549        self.dimension = Some(dimension);
550        Ok(dimension)
551    }
552
553    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
554        self.embed_texts(texts)
555    }
556
557    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
558        if let Some(vector) = self.query_embedding_cache.get(query) {
559            self.query_embedding_cache_hits += 1;
560            return Ok(vector.clone());
561        }
562
563        self.query_embedding_cache_misses += 1;
564        let embeddings = self.embed_texts(vec![query.to_string()])?;
565        let vector = embeddings
566            .first()
567            .cloned()
568            .ok_or_else(|| "embedding model returned no query vector".to_string())?;
569
570        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
571            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
572                self.query_embedding_cache.remove(&oldest);
573            }
574        }
575        self.query_embedding_cache
576            .insert(query.to_string(), vector.clone());
577        self.query_embedding_cache_order
578            .push_back(query.to_string());
579
580        Ok(vector)
581    }
582
583    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
584        (
585            self.query_embedding_cache_hits,
586            self.query_embedding_cache_misses,
587            self.query_embedding_cache.len(),
588        )
589    }
590
591    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
592        match &mut self.engine {
593            SemanticEmbeddingEngine::Local(model) => model
594                .embed(&texts)
595                .map_err(|error| format!("failed to embed batch: {error}")),
596            SemanticEmbeddingEngine::OpenAiCompatible {
597                client,
598                model,
599                base_url,
600                api_key,
601            } => {
602                let expected_text_count = texts.len();
603                let endpoint = build_openai_embeddings_endpoint(base_url);
604                let body = serde_json::json!({
605                    "input": texts,
606                    "model": model,
607                });
608
609                let raw = send_embedding_request(
610                    || {
611                        // `.json(&body)` sets Content-Type: application/json
612                        // automatically. Do NOT add `.header("Content-Type",
613                        // "application/json")` afterwards — RequestBuilder::header()
614                        // calls HeaderMap::append, which produces TWO Content-Type
615                        // headers on the wire. OpenAI's /v1/embeddings endpoint
616                        // treats duplicate Content-Type as malformed and rejects
617                        // the body with 400 "you must provide a model parameter"
618                        // even when `model` is set. Verified end-to-end against
619                        // api.openai.com. See issue #36.
620                        let mut request = client.post(&endpoint).json(&body);
621
622                        if let Some(api_key) = api_key {
623                            request = request.header("Authorization", format!("Bearer {api_key}"));
624                        }
625
626                        request
627                    },
628                    "openai compatible",
629                )?;
630
631                #[derive(Deserialize)]
632                struct OpenAiResponse {
633                    data: Vec<OpenAiEmbeddingResult>,
634                }
635
636                #[derive(Deserialize)]
637                struct OpenAiEmbeddingResult {
638                    embedding: Vec<f32>,
639                    index: Option<u32>,
640                }
641
642                let parsed: OpenAiResponse = serde_json::from_str(&raw)
643                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
644                if parsed.data.len() != expected_text_count {
645                    return Err(format!(
646                        "openai compatible response returned {} embeddings for {} inputs",
647                        parsed.data.len(),
648                        expected_text_count
649                    ));
650                }
651
652                let mut vectors = vec![Vec::new(); parsed.data.len()];
653                for (i, item) in parsed.data.into_iter().enumerate() {
654                    let index = item.index.unwrap_or(i as u32) as usize;
655                    if index >= vectors.len() {
656                        return Err(
657                            "openai compatible response contains invalid vector index".to_string()
658                        );
659                    }
660                    vectors[index] = item.embedding;
661                }
662
663                for vector in &vectors {
664                    if vector.is_empty() {
665                        return Err(
666                            "openai compatible response contained missing vectors".to_string()
667                        );
668                    }
669                }
670
671                self.dimension = vectors.first().map(Vec::len);
672                Ok(vectors)
673            }
674            SemanticEmbeddingEngine::Ollama {
675                client,
676                model,
677                base_url,
678            } => {
679                let expected_text_count = texts.len();
680                let endpoint = build_ollama_embeddings_endpoint(base_url);
681
682                #[derive(Serialize)]
683                struct OllamaPayload<'a> {
684                    model: &'a str,
685                    input: Vec<String>,
686                }
687
688                let payload = OllamaPayload {
689                    model,
690                    input: texts,
691                };
692
693                let raw = send_embedding_request(
694                    || {
695                        // `.json(&payload)` sets Content-Type automatically.
696                        // Same duplicate-header trap as the OpenAI branch above
697                        // — most Ollama servers tolerate it, but the
698                        // single-Content-Type form is the correct one.
699                        client.post(&endpoint).json(&payload)
700                    },
701                    "ollama",
702                )?;
703
704                #[derive(Deserialize)]
705                struct OllamaResponse {
706                    embeddings: Vec<Vec<f32>>,
707                }
708
709                let parsed: OllamaResponse = serde_json::from_str(&raw)
710                    .map_err(|error| format!("invalid ollama response: {error}"))?;
711                if parsed.embeddings.is_empty() {
712                    return Err("ollama response returned no embeddings".to_string());
713                }
714                if parsed.embeddings.len() != expected_text_count {
715                    return Err(format!(
716                        "ollama response returned {} embeddings for {} inputs",
717                        parsed.embeddings.len(),
718                        expected_text_count
719                    ));
720                }
721
722                let vectors = parsed.embeddings;
723                for vector in &vectors {
724                    if vector.is_empty() {
725                        return Err("ollama response contained empty embeddings".to_string());
726                    }
727                }
728
729                self.dimension = vectors.first().map(Vec::len);
730                Ok(vectors)
731            }
732        }
733    }
734}
735
736/// Pre-validate ONNX Runtime by attempting a raw dlopen before ort touches it.
737/// This catches broken/incompatible .so files without risking a panic in the ort crate.
738/// Also checks the runtime version via OrtGetApiBase if available.
739pub fn pre_validate_onnx_runtime() -> Result<(), String> {
740    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();
741
742    #[cfg(any(target_os = "linux", target_os = "macos"))]
743    {
744        #[cfg(target_os = "linux")]
745        let default_name = "libonnxruntime.so";
746        #[cfg(target_os = "macos")]
747        let default_name = "libonnxruntime.dylib";
748
749        let lib_name = dylib_path.as_deref().unwrap_or(default_name);
750
751        unsafe {
752            let c_name = std::ffi::CString::new(lib_name)
753                .map_err(|e| format!("invalid library path: {}", e))?;
754            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
755            if handle.is_null() {
756                let err = libc::dlerror();
757                let msg = if err.is_null() {
758                    "unknown dlopen error".to_string()
759                } else {
760                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
761                };
762                return Err(format!(
763                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
764                     Run `npx @cortexkit/aft doctor` to diagnose.",
765                    lib_name, msg
766                ));
767            }
768
769            // Try to detect the runtime version from the file path or soname.
770            // libonnxruntime.so.1.19.0, libonnxruntime.1.24.4.dylib, etc.
771            let detected_version = detect_ort_version_from_path(lib_name);
772
773            libc::dlclose(handle);
774
775            // Check version compatibility — we need 1.24.x
776            if let Some(ref version) = detected_version {
777                let parts: Vec<&str> = version.split('.').collect();
778                if let (Some(major), Some(minor)) = (
779                    parts.first().and_then(|s| s.parse::<u32>().ok()),
780                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
781                ) {
782                    if major != 1 || minor < 20 {
783                        return Err(format_ort_version_mismatch(version, lib_name));
784                    }
785                }
786            }
787        }
788    }
789
790    #[cfg(target_os = "windows")]
791    {
792        // Validate ONNX Runtime availability on Windows by loading the DLL
793        // via LoadLibraryExW before the ort crate attempts its own LoadLibrary.
794        // This way we can produce a friendly error (with installation hints)
795        // instead of a raw LoadLibrary failure from deep inside fastembed.
796        let lib_name = dylib_path.as_deref().unwrap_or("onnxruntime.dll");
797
798        // Use kernel32 LoadLibraryExW for the validation — built-in, no
799        // crate dependency required. GetModuleFileNameW resolves the loaded
800        // DLL path for version probing via the version.dll API.
801        #[link(name = "kernel32")]
802        extern "system" {
803            fn LoadLibraryExW(
804                lpLibFileName: *const u16,
805                hFile: *mut std::ffi::c_void,
806                dwFlags: u32,
807            ) -> *mut std::ffi::c_void;
808            fn FreeLibrary(hLibModule: *mut std::ffi::c_void) -> i32;
809            fn GetModuleFileNameW(
810                hModule: *mut std::ffi::c_void,
811                lpFilename: *mut u16,
812                nSize: u32,
813            ) -> u32;
814        }
815
816        #[link(name = "version")]
817        extern "system" {
818            fn GetFileVersionInfoSizeW(lptstrFilename: *const u16, lpdwHandle: *mut u32) -> u32;
819            fn GetFileVersionInfoW(
820                lptstrFilename: *const u16,
821                dwHandle: u32,
822                dwLen: u32,
823                lpData: *mut std::ffi::c_void,
824            ) -> i32;
825            fn VerQueryValueW(
826                pBlock: *mut std::ffi::c_void,
827                lpSubBlock: *const u16,
828                lplpBuffer: *mut *mut std::ffi::c_void,
829                puLen: *mut u32,
830            ) -> i32;
831        }
832
833        #[repr(C)]
834        struct VS_FIXEDFILEINFO {
835            dw_signature: u32,
836            dw_struc_version: u32,
837            dw_file_version_ms: u32, // HIWORD major, LOWORD minor
838            dw_file_version_ls: u32, // HIWORD build, LOWORD revision
839            dw_product_version_ms: u32,
840            dw_product_version_ls: u32,
841            dw_file_flags_mask: u32,
842            dw_file_flags: u32,
843            dw_file_os: u32,
844            dw_file_type: u32,
845            dw_file_subtype: u32,
846            dw_file_date_ms: u32,
847            dw_file_date_ls: u32,
848        }
849
850        unsafe {
851            use std::os::windows::ffi::OsStrExt;
852            let wide: Vec<u16> = std::ffi::OsStr::new(lib_name)
853                .encode_wide()
854                .chain(std::iter::once(0))
855                .collect();
856
857            let handle = LoadLibraryExW(wide.as_ptr(), std::ptr::null_mut(), 0);
858            if handle.is_null() {
859                let err = std::io::Error::last_os_error();
860                return Err(format!(
861                    "ONNX Runtime not found. LoadLibraryExW('{}') failed: {}. \
862                     Run `npx @cortexkit/aft doctor` to diagnose.",
863                    lib_name, err
864                ));
865            }
866
867            // Probe the file version from PE resources so we can reject
868            // outdated DLLs (e.g. v1.9.x) before the ort crate panics.
869            let mut detected_major: u32 = 0;
870            let mut detected_minor: u32 = 0;
871            // Use MAX_UNICODEPATH (32767) so deeply nested ORT paths (e.g.
872            // long NuGet package paths under %USERPROFILE%) never truncate.
873            // GetModuleFileNameW truncates silently when the buffer is too
874            // small, which causes version probing to fail and the version
875            // check to be bypassed — better to allocate generously.
876            let mut path_buf = [0u16; 32767];
877            let path_len = GetModuleFileNameW(handle, path_buf.as_mut_ptr(), 32767);
878            if path_len > 0 {
879                let mut dummy_handle: u32 = 0;
880                let info_size = GetFileVersionInfoSizeW(path_buf.as_ptr(), &mut dummy_handle);
881                if info_size > 0 {
882                    let mut info = vec![0u8; info_size as usize];
883                    if GetFileVersionInfoW(
884                        path_buf.as_ptr(),
885                        0,
886                        info_size,
887                        info.as_mut_ptr() as *mut std::ffi::c_void,
888                    ) != 0
889                    {
890                        let sub_block = "\\\0".encode_utf16().collect::<Vec<u16>>();
891                        let mut vs_info: *mut std::ffi::c_void = std::ptr::null_mut();
892                        let mut vs_len: u32 = 0;
893                        if VerQueryValueW(
894                            info.as_mut_ptr() as *mut std::ffi::c_void,
895                            sub_block.as_ptr(),
896                            &mut vs_info,
897                            &mut vs_len,
898                        ) != 0
899                            && !vs_info.is_null()
900                        {
901                            let fixed = vs_info as *const VS_FIXEDFILEINFO;
902                            detected_major = (*fixed).dw_file_version_ms >> 16;
903                            detected_minor = (*fixed).dw_file_version_ms & 0xFFFF;
904                        }
905                    }
906                }
907            }
908
909            FreeLibrary(handle);
910
911            // Version compatibility check (mirrors the Linux/macOS path).
912            // If version could not be detected (detected_major == 0) we let
913            // the load succeed — the ort crate will diagnose further.
914            if detected_major != 0 && (detected_major != 1 || detected_minor < 20) {
915                let ver = format!("{}.{}", detected_major, detected_minor);
916                return Err(format_ort_version_mismatch(&ver, lib_name));
917            }
918        }
919    }
920
921    Ok(())
922}
923
924/// Try to extract the ORT version from the library filename or resolved symlink.
925/// Examples: "libonnxruntime.so.1.19.0" → "1.19.0", "libonnxruntime.1.24.4.dylib" → "1.24.4"
926#[cfg(any(target_os = "linux", target_os = "macos"))]
927fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
928    let path = std::path::Path::new(lib_path);
929
930    // Try the path as given, then follow symlinks
931    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
932        .into_iter()
933        .flatten()
934    {
935        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
936            if let Some(version) = extract_version_from_filename(name) {
937                return Some(version);
938            }
939        }
940    }
941
942    // Also check for versioned siblings in the same directory
943    if let Some(parent) = path.parent() {
944        if let Ok(entries) = std::fs::read_dir(parent) {
945            for entry in entries.flatten() {
946                if let Some(name) = entry.file_name().to_str() {
947                    if name.starts_with("libonnxruntime") {
948                        if let Some(version) = extract_version_from_filename(name) {
949                            return Some(version);
950                        }
951                    }
952                }
953            }
954        }
955    }
956
957    None
958}
959
960/// Extract version from filenames like "libonnxruntime.so.1.19.0" or "libonnxruntime.1.24.4.dylib"
961#[cfg(any(target_os = "linux", target_os = "macos"))]
962fn extract_version_from_filename(name: &str) -> Option<String> {
963    // Match patterns: .so.X.Y.Z or .X.Y.Z.dylib or .X.Y.Z.so
964    let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
965    re.find(name).map(|m| m.as_str().to_string())
966}
967
/// Build the (indented) shell command suggested to the user for removing an
/// incompatible ONNX Runtime library.
///
/// Libraries under `/usr/local/lib`, or referenced only by their bare default
/// name, get a platform-specific system-wide cleanup command on Linux and
/// macOS. Everything else — including every path on other platforms — gets a
/// plain `rm` of the exact path.
fn suggest_removal_command(lib_path: &str) -> String {
    let is_system_install = lib_path.starts_with("/usr/local/lib")
        || matches!(lib_path, "libonnxruntime.so" | "libonnxruntime.dylib");

    if is_system_install {
        if cfg!(target_os = "linux") {
            // Linux also needs the linker cache refreshed after removal.
            return String::from("   sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig");
        }
        if cfg!(target_os = "macos") {
            return String::from("   sudo rm /usr/local/lib/libonnxruntime*");
        }
    }

    format!("   rm '{}'", lib_path)
}
980
981/// Build the user-facing error message for an incompatible ONNX Runtime
982/// install. Extracted as a pure helper so we can unit-test the wording
983/// stability — the auto-fix recommendation must always come first because
984/// it's the only safe option, and the system-rm step must remain present
985/// because some users prefer the system-wide cleanup path.
986pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
987    format!(
988        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
989         Solutions:\n\
990         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
991         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
992         configures the bridge to load it instead of the system library — no \
993         changes to '{}'.\n\
994         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
995         {}\n\
996         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
997         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
998        version,
999        lib_name,
1000        lib_name,
1001        suggest_removal_command(lib_name),
1002    )
1003}
1004
/// Heuristically decide whether an error message means the ONNX Runtime
/// shared library could not be located or loaded.
///
/// Returns `true` when the message (ignoring leading whitespace) starts with
/// the exact prefix `ONNX Runtime not found.`, or when — compared
/// case-insensitively for ASCII — it both names the runtime and contains a
/// phrase typical of a dynamic-library load failure.
pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    /// Spellings under which the runtime shows up in error text.
    const RUNTIME_MARKERS: [&str; 3] = ["onnx runtime", "onnxruntime", "libonnxruntime"];
    /// Phrases that indicate a failed dynamic-library load.
    const LOAD_FAILURE_MARKERS: [&str; 9] = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ];

    // Messages produced by our own pre-validation carry this exact prefix.
    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let lowered = message.to_ascii_lowercase();
    let contains_any = |markers: &[&str]| markers.iter().any(|marker| lowered.contains(*marker));

    contains_any(&RUNTIME_MARKERS) && contains_any(&LOAD_FAILURE_MARKERS)
}
1030
1031pub fn format_embedding_init_error(error: impl Display) -> String {
1032    let message = error.to_string();
1033
1034    if is_onnx_runtime_unavailable(&message) {
1035        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
1036    }
1037
1038    format!("failed to initialize semantic embedding model: {message}")
1039}
1040
/// A chunk of code ready for embedding — derived from a Symbol with context enrichment.
///
/// `embed_text` is what gets sent to the embedding function; the remaining
/// fields are metadata carried alongside the resulting vector.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// Absolute path of the file the symbol was extracted from
    pub file: PathBuf,
    /// Symbol name
    pub name: String,
    /// Symbol kind (function, class, struct, etc.)
    pub kind: SymbolKind,
    /// First line of the symbol (0-based internally, inclusive)
    pub start_line: u32,
    /// Last line of the symbol (0-based internally, inclusive)
    pub end_line: u32,
    /// Whether the symbol is exported
    pub exported: bool,
    /// The enriched text that gets embedded (scope + signature + body snippet)
    pub embed_text: String,
    /// Short code snippet for display in results
    pub snippet: String,
}
1060
/// A stored embedding entry — chunk metadata + vector.
///
/// Both fields are private to this module; entries are produced by pairing
/// each embedded chunk with the vector returned for its `embed_text`.
#[derive(Debug, Clone)]
pub struct EmbeddingEntry {
    /// The chunk this vector was computed from.
    chunk: SemanticChunk,
    /// Embedding vector returned by the embedding function for `chunk`.
    vector: Vec<f32>,
}
1067
/// The semantic index — stores embeddings for all symbols in a project.
///
/// The three per-file maps (`file_mtimes`, `file_sizes`, `file_hashes`) are
/// populated together from the same file metadata and are combined into a
/// freshness record when checking whether a cached file is stale.
#[derive(Debug, Clone)]
pub struct SemanticIndex {
    /// One entry per embedded chunk.
    entries: Vec<EmbeddingEntry>,
    /// Track which files are indexed and their mtime for staleness detection
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Track indexed file sizes alongside mtimes for staleness detection
    file_sizes: HashMap<PathBuf, u64>,
    /// Content hash of each indexed file, checked together with mtime and size
    /// during staleness detection.
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Embedding dimension (384 for MiniLM-L6-v2)
    dimension: usize,
    /// NOTE(review): presumably identifies the backend/model that produced the
    /// vectors; the constructors visible here always start with `None` — confirm
    /// against `SemanticIndexFingerprint`.
    fingerprint: Option<SemanticIndexFingerprint>,
    /// Project root this index belongs to; constructors debug-assert it is absolute.
    project_root: PathBuf,
    /// Files currently marked as deferred. Incremental refresh prunes paths that
    /// are no longer walked and clears a path once it is successfully re-collected.
    /// NOTE(review): what marks a file as deferred is not visible here.
    deferred_files: HashSet<PathBuf>,
}
1083
/// Per-file metadata captured when a file's chunks are collected; it is later
/// split into the index's mtime, size, and content-hash maps.
#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    /// Modification time recorded for the file.
    mtime: SystemTime,
    /// File size recorded for the file (presumably bytes — confirm in `collect_file_metadata`).
    size: u64,
    /// blake3 hash recorded for the file's content.
    content_hash: blake3::Hash,
}
1090
/// Outcome of an incremental refresh of the semantic index, expressed in
/// file counts.
///
/// `total_processed` is the number of current/deleted files that were
/// considered, whether or not any of them needed work.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    /// Previously indexed files that were re-indexed.
    pub changed: usize,
    /// Files indexed for the first time.
    pub added: usize,
    /// Files dropped from the index.
    pub deleted: usize,
    /// Number of current/deleted files considered by the refresh.
    pub total_processed: usize,
}

impl RefreshSummary {
    /// True when the refresh changed, added, and deleted nothing;
    /// `total_processed` does not influence the answer.
    pub fn is_noop(&self) -> bool {
        matches!((self.changed, self.added, self.deleted), (0, 0, 0))
    }
}
1107
/// Bundle of results from refreshing a set of invalidated files.
///
/// NOTE(review): the code that produces and consumes this type is not visible
/// in this part of the file; the field notes below are inferred from names and
/// types only — confirm against the producer.
#[derive(Debug, Default)]
pub struct InvalidatedFilesRefresh {
    /// Newly embedded entries (presumably to be merged into the index).
    pub added_entries: Vec<EmbeddingEntry>,
    /// Fresh freshness records (mtime/size/content hash) keyed by file path.
    pub updated_metadata: Vec<(PathBuf, FileFreshness)>,
    /// Paths whose refresh completed (presumably successfully — TODO confirm).
    pub completed_paths: Vec<PathBuf>,
    /// File-count summary of the refresh.
    pub summary: RefreshSummary,
}
1115
/// Search result from a semantic query.
///
/// Carries the same symbol fields as a `SemanticChunk` (minus the embedded
/// text) plus ranking information.
#[derive(Debug, Clone)]
pub struct SemanticResult {
    /// File containing the matched symbol.
    pub file: PathBuf,
    /// Symbol name.
    pub name: String,
    /// Symbol kind.
    pub kind: SymbolKind,
    /// First line of the symbol.
    pub start_line: u32,
    /// Last line of the symbol.
    pub end_line: u32,
    /// Whether the symbol is exported.
    pub exported: bool,
    /// Short code snippet for display.
    pub snippet: String,
    /// Match score (presumably higher is better — confirm against the search code).
    pub score: f32,
    /// Static label naming where the result came from.
    /// NOTE(review): the set of possible values is not visible here.
    pub source: &'static str,
}
1129
1130impl SemanticIndex {
1131    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
1132        debug_assert!(project_root.is_absolute());
1133        Self {
1134            entries: Vec::new(),
1135            file_mtimes: HashMap::new(),
1136            file_sizes: HashMap::new(),
1137            file_hashes: HashMap::new(),
1138            dimension,
1139            fingerprint: None,
1140            project_root,
1141            deferred_files: HashSet::new(),
1142        }
1143    }
1144
    /// Number of embedded symbol entries (one per embedded chunk, not per file).
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }
1149
    /// Number of files currently tracked by the semantic index.
    ///
    /// Counts the keys of the mtime table, so a tracked file that produced no
    /// entries is still included.
    pub fn indexed_file_count(&self) -> usize {
        self.file_mtimes.len()
    }
1154
1155    /// Human-readable status label for the index.
1156    pub fn status_label(&self) -> &'static str {
1157        if self.entries.is_empty() {
1158            "empty"
1159        } else {
1160            "ready"
1161        }
1162    }
1163
1164    fn collect_chunks(
1165        project_root: &Path,
1166        files: &[PathBuf],
1167    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
1168        let collect_started = std::time::Instant::now();
1169        let per_file: Vec<(
1170            PathBuf,
1171            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
1172        )> = files
1173            .par_iter()
1174            .map_init(HashMap::new, |parsers, file| {
1175                let result = collect_file_metadata(file).and_then(|metadata| {
1176                    collect_file_chunks(project_root, file, parsers)
1177                        .map(|chunks| (metadata, chunks))
1178                });
1179                (file.clone(), result)
1180            })
1181            .collect();
1182
1183        let mut chunks: Vec<SemanticChunk> = Vec::new();
1184        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();
1185
1186        for (file, result) in per_file {
1187            match result {
1188                Ok((metadata, file_chunks)) => {
1189                    file_metadata.insert(file, metadata);
1190                    chunks.extend(file_chunks);
1191                }
1192                Err(error) => {
1193                    // "unsupported file extension" is expected for non-code files
1194                    // (json, xml, .gitignore, etc.) that get included in the
1195                    // project walk. Pre-fix this was swallowed by .unwrap_or_default();
1196                    // we now skip silently to keep the log clean. Only real read/parse
1197                    // errors are worth surfacing.
1198                    if error == "unsupported file extension" {
1199                        continue;
1200                    }
1201                    slog_warn!(
1202                        "failed to collect semantic chunks for {}: {}",
1203                        file.display(),
1204                        error
1205                    );
1206                }
1207            }
1208        }
1209
1210        slog_info!(
1211            "semantic collect: {} chunks from {} files in {} ms",
1212            chunks.len(),
1213            file_metadata.len(),
1214            collect_started.elapsed().as_millis()
1215        );
1216
1217        (chunks, file_metadata)
1218    }
1219
1220    fn build_from_chunks<F, P>(
1221        project_root: &Path,
1222        chunks: Vec<SemanticChunk>,
1223        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
1224        embed_fn: &mut F,
1225        max_batch_size: usize,
1226        mut progress: Option<&mut P>,
1227    ) -> Result<Self, String>
1228    where
1229        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1230        P: FnMut(usize, usize),
1231    {
1232        debug_assert!(project_root.is_absolute());
1233        let total_chunks = chunks.len();
1234
1235        if chunks.is_empty() {
1236            return Ok(Self {
1237                entries: Vec::new(),
1238                file_mtimes: file_metadata
1239                    .iter()
1240                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
1241                    .collect(),
1242                file_sizes: file_metadata
1243                    .iter()
1244                    .map(|(path, metadata)| (path.clone(), metadata.size))
1245                    .collect(),
1246                file_hashes: file_metadata
1247                    .into_iter()
1248                    .map(|(path, metadata)| (path, metadata.content_hash))
1249                    .collect(),
1250                dimension: DEFAULT_DIMENSION,
1251                fingerprint: None,
1252                project_root: project_root.to_path_buf(),
1253                deferred_files: HashSet::new(),
1254            });
1255        }
1256
1257        // Embed in batches
1258        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1259        let mut expected_dimension: Option<usize> = None;
1260        let batch_size = max_batch_size.max(1);
1261        let embed_started = std::time::Instant::now();
1262        let batch_count = total_chunks.div_ceil(batch_size);
1263        for batch_start in (0..chunks.len()).step_by(batch_size) {
1264            let batch_end = (batch_start + batch_size).min(chunks.len());
1265            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1266                .iter()
1267                .map(|c| c.embed_text.clone())
1268                .collect();
1269
1270            let vectors = embed_fn(batch_texts)?;
1271            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1272
1273            // Track consistent dimension across all batches
1274            if let Some(dim) = vectors.first().map(|v| v.len()) {
1275                match expected_dimension {
1276                    None => expected_dimension = Some(dim),
1277                    Some(expected) if dim != expected => {
1278                        return Err(format!(
1279                            "embedding dimension changed across batches: expected {expected}, got {dim}"
1280                        ));
1281                    }
1282                    _ => {}
1283                }
1284            }
1285
1286            for (i, vector) in vectors.into_iter().enumerate() {
1287                let chunk_idx = batch_start + i;
1288                entries.push(EmbeddingEntry {
1289                    chunk: chunks[chunk_idx].clone(),
1290                    vector,
1291                });
1292            }
1293
1294            if let Some(callback) = progress.as_mut() {
1295                callback(entries.len(), total_chunks);
1296            }
1297        }
1298
1299        let embed_ms = embed_started.elapsed().as_millis();
1300        let rate = (total_chunks as u128 * 1000)
1301            .checked_div(embed_ms)
1302            .unwrap_or(0) as u64;
1303        slog_info!(
1304            "semantic embed: {} chunks in {} batches, {} ms ({} chunks/s)",
1305            total_chunks,
1306            batch_count,
1307            embed_ms,
1308            rate
1309        );
1310
1311        let dimension = entries
1312            .first()
1313            .map(|e| e.vector.len())
1314            .unwrap_or(DEFAULT_DIMENSION);
1315
1316        Ok(Self {
1317            entries,
1318            file_mtimes: file_metadata
1319                .iter()
1320                .map(|(path, metadata)| (path.clone(), metadata.mtime))
1321                .collect(),
1322            file_sizes: file_metadata
1323                .iter()
1324                .map(|(path, metadata)| (path.clone(), metadata.size))
1325                .collect(),
1326            file_hashes: file_metadata
1327                .into_iter()
1328                .map(|(path, metadata)| (path, metadata.content_hash))
1329                .collect(),
1330            dimension,
1331            fingerprint: None,
1332            project_root: project_root.to_path_buf(),
1333            deferred_files: HashSet::new(),
1334        })
1335    }
1336
1337    /// Build the semantic index from a set of files using the provided embedding function.
1338    /// `embed_fn` takes a batch of texts and returns a batch of embedding vectors.
1339    pub fn build<F>(
1340        project_root: &Path,
1341        files: &[PathBuf],
1342        embed_fn: &mut F,
1343        max_batch_size: usize,
1344    ) -> Result<Self, String>
1345    where
1346        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1347    {
1348        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1349        Self::build_from_chunks(
1350            project_root,
1351            chunks,
1352            file_mtimes,
1353            embed_fn,
1354            max_batch_size,
1355            Option::<&mut fn(usize, usize)>::None,
1356        )
1357    }
1358
1359    /// Build the semantic index and report embedding progress using entry counts.
1360    pub fn build_with_progress<F, P>(
1361        project_root: &Path,
1362        files: &[PathBuf],
1363        embed_fn: &mut F,
1364        max_batch_size: usize,
1365        progress: &mut P,
1366    ) -> Result<Self, String>
1367    where
1368        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1369        P: FnMut(usize, usize),
1370    {
1371        let (chunks, file_mtimes) = Self::collect_chunks(project_root, files);
1372        let total_chunks = chunks.len();
1373        progress(0, total_chunks);
1374        Self::build_from_chunks(
1375            project_root,
1376            chunks,
1377            file_mtimes,
1378            embed_fn,
1379            max_batch_size,
1380            Some(progress),
1381        )
1382    }
1383
1384    /// Incrementally refresh entries for changed/new files only, preserving cached
1385    /// embeddings for unchanged files. Used when loading the index from disk and
1386    /// finding that a small fraction of files have moved on, deleted, or appeared.
1387    ///
1388    /// Returns `RefreshSummary` describing what changed. On success, `self` is
1389    /// mutated in place and remains a valid index.
1390    ///
1391    /// `current_files` is the full set of files the project considers indexable
1392    /// (typically `walk_project_files(...)`). Files in the cache that are no
1393    /// longer in this set are treated as deleted.
1394    pub fn refresh_stale_files<F, P>(
1395        &mut self,
1396        project_root: &Path,
1397        current_files: &[PathBuf],
1398        embed_fn: &mut F,
1399        max_batch_size: usize,
1400        progress: &mut P,
1401    ) -> Result<RefreshSummary, String>
1402    where
1403        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1404        P: FnMut(usize, usize),
1405    {
1406        self.backfill_missing_file_sizes();
1407
1408        // 1. Bucket files into deleted / changed / added.
1409        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
1410        self.deferred_files
1411            .retain(|path| current_set.contains(path.as_path()));
1412        let total_processed = current_set.len() + self.file_mtimes.len()
1413            - self
1414                .file_mtimes
1415                .keys()
1416                .filter(|path| current_set.contains(path.as_path()))
1417                .count();
1418
1419        // Files in cache that disappeared from disk OR are no longer in the
1420        // walked set. Both cases need their entries dropped.
1421        let mut deleted: Vec<PathBuf> = Vec::new();
1422        let mut changed: Vec<PathBuf> = Vec::new();
1423        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
1424        for indexed_path in &indexed_paths {
1425            if !current_set.contains(indexed_path.as_path()) {
1426                deleted.push(indexed_path.clone());
1427                continue;
1428            }
1429            let cached = match (
1430                self.file_mtimes.get(indexed_path),
1431                self.file_sizes.get(indexed_path),
1432                self.file_hashes.get(indexed_path),
1433            ) {
1434                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
1435                    mtime: *mtime,
1436                    size: *size,
1437                    content_hash: *hash,
1438                }),
1439                _ => None,
1440            };
1441            match cached
1442                .map(|freshness| cache_freshness::verify_file_strict(indexed_path, &freshness))
1443            {
1444                Some(FreshnessVerdict::HotFresh) => {}
1445                Some(FreshnessVerdict::ContentFresh {
1446                    new_mtime,
1447                    new_size,
1448                }) => {
1449                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
1450                    self.file_sizes.insert(indexed_path.clone(), new_size);
1451                }
1452                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
1453                    changed.push(indexed_path.clone());
1454                }
1455            }
1456        }
1457
1458        // Files in walk that were never indexed.
1459        let mut added: Vec<PathBuf> = Vec::new();
1460        for path in current_files {
1461            if !self.file_mtimes.contains_key(path) {
1462                added.push(path.clone());
1463            }
1464        }
1465
1466        // Fast path: nothing to do.
1467        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
1468            progress(0, 0);
1469            return Ok(RefreshSummary {
1470                total_processed,
1471                ..RefreshSummary::default()
1472            });
1473        }
1474
1475        // 2. Drop entries for deleted files immediately. Changed files are only
1476        //    replaced after successful re-extraction + embedding so transient
1477        //    read/parse errors keep the stale-but-valid cache entry.
1478        if !deleted.is_empty() {
1479            self.remove_indexed_files(&deleted);
1480        }
1481
1482        // 3. Embed the changed + added set, if any.
1483        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
1484        to_embed.extend(changed.iter().cloned());
1485        to_embed.extend(added.iter().cloned());
1486
1487        if to_embed.is_empty() {
1488            // Only deletions happened.
1489            progress(0, 0);
1490            return Ok(RefreshSummary {
1491                changed: 0,
1492                added: 0,
1493                deleted: deleted.len(),
1494                total_processed,
1495            });
1496        }
1497
1498        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);
1499        let changed_set: HashSet<&Path> = changed.iter().map(PathBuf::as_path).collect();
1500        let vanished = to_embed
1501            .iter()
1502            .filter(|path| {
1503                changed_set.contains(path.as_path())
1504                    && !fresh_metadata.contains_key(*path)
1505                    && !path.exists()
1506            })
1507            .cloned()
1508            .collect::<Vec<_>>();
1509        if !vanished.is_empty() {
1510            self.remove_indexed_files(&vanished);
1511            deleted.extend(vanished);
1512        }
1513
1514        if chunks.is_empty() {
1515            progress(0, 0);
1516            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1517            for file in &successful_files {
1518                self.deferred_files.remove(file);
1519            }
1520            if !successful_files.is_empty() {
1521                self.entries
1522                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
1523            }
1524            let changed_count = changed
1525                .iter()
1526                .filter(|path| successful_files.contains(*path))
1527                .count();
1528            let added_count = added
1529                .iter()
1530                .filter(|path| successful_files.contains(*path))
1531                .count();
1532            for (file, metadata) in fresh_metadata {
1533                self.file_mtimes.insert(file.clone(), metadata.mtime);
1534                self.file_sizes.insert(file.clone(), metadata.size);
1535                self.file_hashes.insert(file.clone(), metadata.content_hash);
1536            }
1537            return Ok(RefreshSummary {
1538                changed: changed_count,
1539                added: added_count,
1540                deleted: deleted.len(),
1541                total_processed,
1542            });
1543        }
1544
1545        // 4. Embed in batches and dimension-check against the existing index.
1546        let total_chunks = chunks.len();
1547        progress(0, total_chunks);
1548        let batch_size = max_batch_size.max(1);
1549        let existing_dimension = if self.entries.is_empty() {
1550            None
1551        } else {
1552            Some(self.dimension)
1553        };
1554        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1555        let mut observed_dimension: Option<usize> = existing_dimension;
1556
1557        for batch_start in (0..chunks.len()).step_by(batch_size) {
1558            let batch_end = (batch_start + batch_size).min(chunks.len());
1559            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1560                .iter()
1561                .map(|c| c.embed_text.clone())
1562                .collect();
1563
1564            let vectors = embed_fn(batch_texts)?;
1565            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1566
1567            if let Some(dim) = vectors.first().map(|v| v.len()) {
1568                match observed_dimension {
1569                    None => observed_dimension = Some(dim),
1570                    Some(expected) if dim != expected => {
1571                        // Refuse to mix dimensions in one index. Caller should
1572                        // fall back to a full rebuild.
1573                        return Err(format!(
1574                            "embedding dimension changed during incremental refresh: \
1575                             cached index uses {expected}, new vectors use {dim}"
1576                        ));
1577                    }
1578                    _ => {}
1579                }
1580            }
1581
1582            for (i, vector) in vectors.into_iter().enumerate() {
1583                let chunk_idx = batch_start + i;
1584                new_entries.push(EmbeddingEntry {
1585                    chunk: chunks[chunk_idx].clone(),
1586                    vector,
1587                });
1588            }
1589
1590            progress(new_entries.len(), total_chunks);
1591        }
1592
1593        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1594        for file in &successful_files {
1595            self.deferred_files.remove(file);
1596        }
1597        if !successful_files.is_empty() {
1598            self.entries
1599                .retain(|entry| !successful_files.contains(&entry.chunk.file));
1600        }
1601
1602        self.entries.extend(new_entries);
1603        for (file, metadata) in fresh_metadata {
1604            self.file_mtimes.insert(file.clone(), metadata.mtime);
1605            self.file_sizes.insert(file.clone(), metadata.size);
1606            self.file_hashes.insert(file, metadata.content_hash);
1607        }
1608        if let Some(dim) = observed_dimension {
1609            self.dimension = dim;
1610        }
1611
1612        Ok(RefreshSummary {
1613            changed: changed
1614                .iter()
1615                .filter(|path| successful_files.contains(*path))
1616                .count(),
1617            added: added
1618                .iter()
1619                .filter(|path| successful_files.contains(*path))
1620                .count(),
1621            deleted: deleted.len(),
1622            total_processed,
1623        })
1624    }
1625
1626    /// Refresh exactly the files invalidated by the live watcher, without
1627    /// treating the provided path list as the whole project. This is the
1628    /// watcher-side counterpart to `refresh_stale_files`: it drops any stale
1629    /// entries for the requested paths from this in-memory index, re-extracts
1630    /// whatever still exists on disk, embeds those chunks, and returns the
1631    /// delta needed for another in-memory index to apply the same update.
1632    pub fn refresh_invalidated_files<F, P>(
1633        &mut self,
1634        project_root: &Path,
1635        paths: &[PathBuf],
1636        embed_fn: &mut F,
1637        max_batch_size: usize,
1638        max_files: usize,
1639        progress: &mut P,
1640    ) -> Result<InvalidatedFilesRefresh, String>
1641    where
1642        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
1643        P: FnMut(usize, usize),
1644    {
1645        self.backfill_missing_file_sizes();
1646
1647        self.deferred_files.retain(|path| path.exists());
1648        let mut requested_paths = paths.to_vec();
1649        requested_paths.extend(self.deferred_files.iter().cloned());
1650        requested_paths.sort();
1651        requested_paths.dedup();
1652        let total_processed = requested_paths.len();
1653
1654        if requested_paths.is_empty() {
1655            progress(0, 0);
1656            return Ok(InvalidatedFilesRefresh {
1657                summary: RefreshSummary {
1658                    total_processed,
1659                    ..RefreshSummary::default()
1660                },
1661                ..InvalidatedFilesRefresh::default()
1662            });
1663        }
1664
1665        let previously_indexed: HashSet<PathBuf> = requested_paths
1666            .iter()
1667            .filter(|path| self.file_mtimes.contains_key(*path))
1668            .cloned()
1669            .collect();
1670
1671        // The watcher path has already invalidated these files in the request
1672        // thread's live index. Mirror that behavior here before inserting any
1673        // fresh chunks so parse/read failures do not resurrect stale entries.
1674        self.remove_indexed_files(&requested_paths);
1675
1676        let existing_paths = requested_paths
1677            .iter()
1678            .filter(|path| path.exists())
1679            .cloned()
1680            .collect::<Vec<_>>();
1681        let deleted = requested_paths
1682            .iter()
1683            .filter(|path| !path.exists() && previously_indexed.contains(path.as_path()))
1684            .count();
1685
1686        if existing_paths.is_empty() {
1687            for path in &requested_paths {
1688                if !path.exists() {
1689                    self.deferred_files.remove(path);
1690                }
1691            }
1692            progress(0, 0);
1693            return Ok(InvalidatedFilesRefresh {
1694                completed_paths: requested_paths,
1695                summary: RefreshSummary {
1696                    deleted,
1697                    total_processed,
1698                    ..RefreshSummary::default()
1699                },
1700                ..InvalidatedFilesRefresh::default()
1701            });
1702        }
1703
1704        let (mut chunks, mut fresh_metadata) = Self::collect_chunks(project_root, &existing_paths);
1705
1706        let retained_file_count = self.file_mtimes.len();
1707        let changed_successful_count = existing_paths
1708            .iter()
1709            .filter(|path| {
1710                previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1711            })
1712            .count();
1713        let available_new_files =
1714            max_files.saturating_sub(retained_file_count.saturating_add(changed_successful_count));
1715        let new_successful_files = existing_paths
1716            .iter()
1717            .filter(|path| {
1718                !previously_indexed.contains(path.as_path()) && fresh_metadata.contains_key(*path)
1719            })
1720            .cloned()
1721            .collect::<Vec<_>>();
1722        if new_successful_files.len() > available_new_files {
1723            let allowed_new_files = new_successful_files
1724                .iter()
1725                .take(available_new_files)
1726                .cloned()
1727                .collect::<HashSet<_>>();
1728            let deferred_new_files = new_successful_files
1729                .into_iter()
1730                .filter(|path| !allowed_new_files.contains(path))
1731                .collect::<HashSet<_>>();
1732
1733            fresh_metadata.retain(|file, _| {
1734                previously_indexed.contains(file.as_path()) || allowed_new_files.contains(file)
1735            });
1736            chunks.retain(|chunk| !deferred_new_files.contains(&chunk.file));
1737
1738            if !deferred_new_files.is_empty() {
1739                for path in &deferred_new_files {
1740                    self.deferred_files.insert(path.clone());
1741                }
1742                slog_warn!(
1743                    "semantic refresh deferred {} new file(s): indexed-file cap {} is reached",
1744                    deferred_new_files.len(),
1745                    max_files
1746                );
1747            }
1748        }
1749
1750        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
1751        for file in &successful_files {
1752            self.deferred_files.remove(file);
1753        }
1754        let changed = successful_files
1755            .iter()
1756            .filter(|path| previously_indexed.contains(path.as_path()))
1757            .count();
1758        let added = successful_files.len().saturating_sub(changed);
1759        let mut updated_metadata = Vec::with_capacity(fresh_metadata.len());
1760
1761        if chunks.is_empty() {
1762            progress(0, 0);
1763            for (file, metadata) in fresh_metadata {
1764                let freshness = FileFreshness {
1765                    mtime: metadata.mtime,
1766                    size: metadata.size,
1767                    content_hash: metadata.content_hash,
1768                };
1769                self.file_mtimes.insert(file.clone(), freshness.mtime);
1770                self.file_sizes.insert(file.clone(), freshness.size);
1771                self.file_hashes
1772                    .insert(file.clone(), freshness.content_hash);
1773                updated_metadata.push((file, freshness));
1774            }
1775
1776            return Ok(InvalidatedFilesRefresh {
1777                updated_metadata,
1778                completed_paths: requested_paths,
1779                summary: RefreshSummary {
1780                    changed,
1781                    added,
1782                    deleted,
1783                    total_processed,
1784                },
1785                ..InvalidatedFilesRefresh::default()
1786            });
1787        }
1788
1789        let total_chunks = chunks.len();
1790        progress(0, total_chunks);
1791        let batch_size = max_batch_size.max(1);
1792        let mut observed_dimension = if self.entries.is_empty() && previously_indexed.is_empty() {
1793            None
1794        } else {
1795            Some(self.dimension)
1796        };
1797        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
1798
1799        for batch_start in (0..chunks.len()).step_by(batch_size) {
1800            let batch_end = (batch_start + batch_size).min(chunks.len());
1801            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
1802                .iter()
1803                .map(|chunk| chunk.embed_text.clone())
1804                .collect();
1805
1806            let vectors = embed_fn(batch_texts)?;
1807            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;
1808
1809            if let Some(dim) = vectors.first().map(|vector| vector.len()) {
1810                match observed_dimension {
1811                    None => observed_dimension = Some(dim),
1812                    Some(expected) if dim != expected => {
1813                        return Err(format!(
1814                            "embedding dimension changed during invalidated-file refresh: \
1815                             cached index uses {expected}, new vectors use {dim}"
1816                        ));
1817                    }
1818                    _ => {}
1819                }
1820            }
1821
1822            for (i, vector) in vectors.into_iter().enumerate() {
1823                let chunk_idx = batch_start + i;
1824                new_entries.push(EmbeddingEntry {
1825                    chunk: chunks[chunk_idx].clone(),
1826                    vector,
1827                });
1828            }
1829
1830            progress(new_entries.len(), total_chunks);
1831        }
1832
1833        let added_entries = new_entries.clone();
1834        self.entries.extend(new_entries);
1835        for (file, metadata) in fresh_metadata {
1836            let freshness = FileFreshness {
1837                mtime: metadata.mtime,
1838                size: metadata.size,
1839                content_hash: metadata.content_hash,
1840            };
1841            self.file_mtimes.insert(file.clone(), freshness.mtime);
1842            self.file_sizes.insert(file.clone(), freshness.size);
1843            self.file_hashes
1844                .insert(file.clone(), freshness.content_hash);
1845            updated_metadata.push((file, freshness));
1846        }
1847        if let Some(dim) = observed_dimension {
1848            self.dimension = dim;
1849        }
1850
1851        Ok(InvalidatedFilesRefresh {
1852            added_entries,
1853            updated_metadata,
1854            completed_paths: requested_paths,
1855            summary: RefreshSummary {
1856                changed,
1857                added,
1858                deleted,
1859                total_processed,
1860            },
1861        })
1862    }
1863
1864    pub fn apply_refresh_update(
1865        &mut self,
1866        added_entries: Vec<EmbeddingEntry>,
1867        updated_metadata: Vec<(PathBuf, FileFreshness)>,
1868        completed_paths: &[PathBuf],
1869    ) {
1870        self.remove_indexed_files(completed_paths);
1871
1872        let observed_dimension = added_entries.first().map(|entry| entry.vector.len());
1873        self.entries.extend(added_entries);
1874        for (file, freshness) in updated_metadata {
1875            self.file_mtimes.insert(file.clone(), freshness.mtime);
1876            self.file_sizes.insert(file.clone(), freshness.size);
1877            self.file_hashes.insert(file, freshness.content_hash);
1878        }
1879        if let Some(dim) = observed_dimension {
1880            self.dimension = dim;
1881        }
1882    }
1883
1884    fn remove_indexed_files(&mut self, files: &[PathBuf]) {
1885        let deleted_set: HashSet<&Path> = files.iter().map(PathBuf::as_path).collect();
1886        self.entries
1887            .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
1888        for path in files {
1889            self.file_mtimes.remove(path);
1890            self.file_sizes.remove(path);
1891            self.file_hashes.remove(path);
1892        }
1893    }
1894
1895    /// Search the index with a query embedding, returning top-K results sorted by relevance
1896    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1897        if self.entries.is_empty() || query_vector.len() != self.dimension {
1898            return Vec::new();
1899        }
1900
1901        let mut scored: Vec<(f32, usize)> = self
1902            .entries
1903            .iter()
1904            .enumerate()
1905            .map(|(i, entry)| {
1906                let mut score = cosine_similarity(query_vector, &entry.vector);
1907                if entry.chunk.exported {
1908                    score *= 1.1;
1909                }
1910                (score, i)
1911            })
1912            .collect();
1913
1914        // Sort descending by score
1915        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1916
1917        scored
1918            .into_iter()
1919            .take(top_k)
1920            // Keep the sort → take → map ordering explicit: removing the old
1921            // `> 0.0` floor cannot evict positive hits because top_k has already
1922            // been selected, but it can surface zero-score noise in the tail.
1923            .map(|(score, idx)| {
1924                let entry = &self.entries[idx];
1925                SemanticResult {
1926                    file: entry.chunk.file.clone(),
1927                    name: entry.chunk.name.clone(),
1928                    kind: entry.chunk.kind.clone(),
1929                    start_line: entry.chunk.start_line,
1930                    end_line: entry.chunk.end_line,
1931                    exported: entry.chunk.exported,
1932                    snippet: entry.chunk.snippet.clone(),
1933                    score,
1934                    source: "semantic",
1935                }
1936            })
1937            .collect()
1938    }
1939
    /// Number of embedding entries (indexed chunks) currently held in memory.
    pub fn len(&self) -> usize {
        self.entries.len()
    }
1944
1945    /// Check if a file needs re-indexing based on mtime/size
1946    pub fn is_file_stale(&self, file: &Path) -> bool {
1947        let Some(stored_mtime) = self.file_mtimes.get(file) else {
1948            return true;
1949        };
1950        let Some(stored_size) = self.file_sizes.get(file) else {
1951            return true;
1952        };
1953        let Some(stored_hash) = self.file_hashes.get(file) else {
1954            return true;
1955        };
1956        let cached = FileFreshness {
1957            mtime: *stored_mtime,
1958            size: *stored_size,
1959            content_hash: *stored_hash,
1960        };
1961        match cache_freshness::verify_file_strict(file, &cached) {
1962            FreshnessVerdict::HotFresh => false,
1963            FreshnessVerdict::ContentFresh { .. } => false,
1964            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
1965        }
1966    }
1967
1968    fn backfill_missing_file_sizes(&mut self) {
1969        for path in self.file_mtimes.keys() {
1970            if self.file_sizes.contains_key(path) {
1971                continue;
1972            }
1973            if let Ok(metadata) = fs::metadata(path) {
1974                self.file_sizes.insert(path.clone(), metadata.len());
1975                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
1976                    self.file_hashes.insert(path.clone(), hash);
1977                }
1978            }
1979        }
1980    }
1981
    /// Remove entries for a specific file.
    ///
    /// Thin alias that forwards to `invalidate_file`.
    pub fn remove_file(&mut self, file: &Path) {
        self.invalidate_file(file);
    }
1986
1987    pub fn invalidate_file(&mut self, file: &Path) {
1988        let canonical_file = canonicalize_existing_or_deleted_path(file);
1989        self.entries
1990            .retain(|e| e.chunk.file != file && e.chunk.file != canonical_file);
1991        self.file_mtimes.remove(file);
1992        self.file_sizes.remove(file);
1993        self.file_hashes.remove(file);
1994        if canonical_file.as_path() != file {
1995            self.file_mtimes.remove(&canonical_file);
1996            self.file_sizes.remove(&canonical_file);
1997            self.file_hashes.remove(&canonical_file);
1998        }
1999    }
2000
    /// Embedding dimension currently recorded for this index, i.e. the value
    /// `search` requires a query vector's length to match.
    pub fn dimension(&self) -> usize {
        self.dimension
    }
2005
    /// Borrow the fingerprint attached to this index, or `None` when no
    /// fingerprint has been set.
    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
        self.fingerprint.as_ref()
    }
2009
2010    pub fn backend_label(&self) -> Option<&str> {
2011        self.fingerprint.as_ref().map(|f| f.backend.as_str())
2012    }
2013
2014    pub fn model_label(&self) -> Option<&str> {
2015        self.fingerprint.as_ref().map(|f| f.model.as_str())
2016    }
2017
    /// Attach `fingerprint` to this index, replacing any previous one.
    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
        self.fingerprint = Some(fingerprint);
    }
2021
2022    /// Write the semantic index to disk using atomic temp+rename pattern
2023    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
2024        // Don't persist empty indexes — they would be loaded on next startup
2025        // and prevent a fresh build that might find files.
2026        if self.entries.is_empty() {
2027            slog_info!("skipping semantic index persistence (0 entries)");
2028            return;
2029        }
2030        let dir = storage_dir.join("semantic").join(project_key);
2031        if let Err(e) = fs::create_dir_all(&dir) {
2032            slog_warn!("failed to create semantic cache dir: {}", e);
2033            return;
2034        }
2035        let data_path = dir.join("semantic.bin");
2036        let tmp_path = dir.join(format!(
2037            "semantic.bin.tmp.{}.{}",
2038            std::process::id(),
2039            SystemTime::now()
2040                .duration_since(SystemTime::UNIX_EPOCH)
2041                .unwrap_or(Duration::ZERO)
2042                .as_nanos()
2043        ));
2044        let bytes = self.to_bytes();
2045        let write_result = (|| -> std::io::Result<()> {
2046            use std::io::Write;
2047            let mut file = fs::File::create(&tmp_path)?;
2048            file.write_all(&bytes)?;
2049            file.sync_all()?;
2050            Ok(())
2051        })();
2052        if let Err(e) = write_result {
2053            slog_warn!("failed to write semantic index: {}", e);
2054            let _ = fs::remove_file(&tmp_path);
2055            return;
2056        }
2057        if let Err(e) = fs::rename(&tmp_path, &data_path) {
2058            slog_warn!("failed to rename semantic index: {}", e);
2059            let _ = fs::remove_file(&tmp_path);
2060            return;
2061        }
2062        slog_info!(
2063            "semantic index persisted: {} entries, {:.1} KB",
2064            self.entries.len(),
2065            bytes.len() as f64 / 1024.0
2066        );
2067    }
2068
2069    /// Read the semantic index from disk
2070    pub fn read_from_disk(
2071        storage_dir: &Path,
2072        project_key: &str,
2073        current_canonical_root: &Path,
2074        is_worktree_bridge: bool,
2075        expected_fingerprint: Option<&str>,
2076    ) -> Option<Self> {
2077        debug_assert!(current_canonical_root.is_absolute());
2078        let data_path = storage_dir
2079            .join("semantic")
2080            .join(project_key)
2081            .join("semantic.bin");
2082        let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
2083        if file_len < HEADER_BYTES_V1 {
2084            slog_warn!(
2085                "corrupt semantic index (too small: {} bytes), removing",
2086                file_len
2087            );
2088            if !is_worktree_bridge {
2089                let _ = fs::remove_file(&data_path);
2090            }
2091            return None;
2092        }
2093
2094        let bytes = fs::read(&data_path).ok()?;
2095        let version = bytes[0];
2096        if version != SEMANTIC_INDEX_VERSION_V6 {
2097            slog_info!(
2098                "cached semantic index version {} is older than {}, rebuilding",
2099                version,
2100                SEMANTIC_INDEX_VERSION_V6
2101            );
2102            if !is_worktree_bridge {
2103                let _ = fs::remove_file(&data_path);
2104            }
2105            return None;
2106        }
2107        match Self::from_bytes(&bytes, current_canonical_root) {
2108            Ok(index) => {
2109                if index.entries.is_empty() {
2110                    slog_info!("cached semantic index is empty, will rebuild");
2111                    if !is_worktree_bridge {
2112                        let _ = fs::remove_file(&data_path);
2113                    }
2114                    return None;
2115                }
2116                if let Some(expected) = expected_fingerprint {
2117                    let matches = index
2118                        .fingerprint()
2119                        .map(|fingerprint| fingerprint.matches_expected(expected))
2120                        .unwrap_or(false);
2121                    if !matches {
2122                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
2123                        if !is_worktree_bridge {
2124                            let _ = fs::remove_file(&data_path);
2125                        }
2126                        return None;
2127                    }
2128                }
2129                slog_info!(
2130                    "loaded semantic index from disk: {} entries",
2131                    index.entries.len()
2132                );
2133                Some(index)
2134            }
2135            Err(e) => {
2136                slog_warn!("corrupt semantic index, rebuilding: {}", e);
2137                if !is_worktree_bridge {
2138                    let _ = fs::remove_file(&data_path);
2139                }
2140                None
2141            }
2142        }
2143    }
2144
2145    /// Serialize the index to bytes for disk persistence
2146    pub fn to_bytes(&self) -> Vec<u8> {
2147        let mut buf = Vec::new();
2148        let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
2149            let encoded = fingerprint.as_string();
2150            if encoded.is_empty() {
2151                None
2152            } else {
2153                Some(encoded.into_bytes())
2154            }
2155        });
2156        let file_mtimes: Vec<_> = self
2157            .file_mtimes
2158            .iter()
2159            .filter_map(|(path, mtime)| {
2160                cache_relative_path(&self.project_root, path)
2161                    .map(|relative| (relative, path, mtime))
2162            })
2163            .collect();
2164        let entries: Vec<_> = self
2165            .entries
2166            .iter()
2167            .filter_map(|entry| {
2168                cache_relative_path(&self.project_root, &entry.chunk.file)
2169                    .map(|relative| (relative, entry))
2170            })
2171            .collect();
2172
2173        // Header: version(1) + dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint
2174        //
2175        // V6 is the single write format. Layout extends V5:
2176        //   - fingerprint is always represented (absent ⇒ fingerprint_len=0,
2177        //     no bytes follow). Uniform format simplifies the reader.
2178        //   - paths are relative to project_root.
2179        //   - file metadata stored as secs(u64) + subsec_nanos(u32) + size(u64) + blake3(32).
2180        //     Preserves full APFS/ext4/NTFS precision and catches mtime ties.
2181        //
2182        // V1/V2 remain readable for backward compatibility (see from_bytes).
2183        // V3/V4 load as compatible formats but are rejected on disk so snippets
2184        // and file sizes are rebuilt once.
2185        let version = SEMANTIC_INDEX_VERSION_V6;
2186        buf.push(version);
2187        buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
2188        buf.extend_from_slice(&(entries.len() as u32).to_le_bytes());
2189        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
2190        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
2191        buf.extend_from_slice(fp_bytes_ref);
2192
2193        // File mtime table: count(4) + entries
2194        // V3 layout per entry: path_len(4) + path + secs(8) + subsec_nanos(4)
2195        buf.extend_from_slice(&(file_mtimes.len() as u32).to_le_bytes());
2196        for (relative, path, mtime) in &file_mtimes {
2197            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
2198            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
2199            buf.extend_from_slice(&path_bytes);
2200            let duration = mtime
2201                .duration_since(SystemTime::UNIX_EPOCH)
2202                .unwrap_or_default();
2203            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
2204            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
2205            let size = self.file_sizes.get(*path).copied().unwrap_or_default();
2206            buf.extend_from_slice(&size.to_le_bytes());
2207            let hash = self
2208                .file_hashes
2209                .get(*path)
2210                .copied()
2211                .unwrap_or_else(cache_freshness::zero_hash);
2212            buf.extend_from_slice(hash.as_bytes());
2213        }
2214
2215        // Entries: each is metadata + vector
2216        for (relative, entry) in &entries {
2217            let c = &entry.chunk;
2218
2219            // File path
2220            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
2221            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
2222            buf.extend_from_slice(&file_bytes);
2223
2224            // Name
2225            let name_bytes = c.name.as_bytes();
2226            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
2227            buf.extend_from_slice(name_bytes);
2228
2229            // Kind (1 byte)
2230            buf.push(symbol_kind_to_u8(&c.kind));
2231
2232            // Lines + exported
2233            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
2234            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
2235            buf.push(c.exported as u8);
2236
2237            // Snippet
2238            let snippet_bytes = c.snippet.as_bytes();
2239            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
2240            buf.extend_from_slice(snippet_bytes);
2241
2242            // Embed text
2243            let embed_bytes = c.embed_text.as_bytes();
2244            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
2245            buf.extend_from_slice(embed_bytes);
2246
2247            // Vector (f32 array)
2248            for &val in &entry.vector {
2249                buf.extend_from_slice(&val.to_le_bytes());
2250            }
2251        }
2252
2253        buf
2254    }
2255
2256    /// Deserialize the index from bytes
2257    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
2258        debug_assert!(current_canonical_root.is_absolute());
2259        let mut pos = 0;
2260
2261        if data.len() < HEADER_BYTES_V1 {
2262            return Err("data too short".to_string());
2263        }
2264
2265        let version = data[pos];
2266        pos += 1;
2267        if version != SEMANTIC_INDEX_VERSION_V1
2268            && version != SEMANTIC_INDEX_VERSION_V2
2269            && version != SEMANTIC_INDEX_VERSION_V3
2270            && version != SEMANTIC_INDEX_VERSION_V4
2271            && version != SEMANTIC_INDEX_VERSION_V5
2272            && version != SEMANTIC_INDEX_VERSION_V6
2273        {
2274            return Err(format!("unsupported version: {}", version));
2275        }
2276        // V2 and newer share the same header layout (V3/V4/V5 only differ from
2277        // V2 in the per-mtime entry layout): version(1) + dimension(4) +
2278        // entry_count(4) + fingerprint_len(4) + fingerprint bytes.
2279        if (version == SEMANTIC_INDEX_VERSION_V2
2280            || version == SEMANTIC_INDEX_VERSION_V3
2281            || version == SEMANTIC_INDEX_VERSION_V4
2282            || version == SEMANTIC_INDEX_VERSION_V5
2283            || version == SEMANTIC_INDEX_VERSION_V6)
2284            && data.len() < HEADER_BYTES_V2
2285        {
2286            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
2287        }
2288
2289        let dimension = read_u32(data, &mut pos)? as usize;
2290        let entry_count = read_u32(data, &mut pos)? as usize;
2291        validate_embedding_dimension(dimension)?;
2292        if entry_count > MAX_ENTRIES {
2293            return Err(format!("too many semantic index entries: {}", entry_count));
2294        }
2295
2296        // Fingerprint handling:
2297        //   - V1: no fingerprint field at all.
2298        //   - V2: fingerprint_len + fingerprint bytes; always present (writer
2299        //     only emitted V2 when fingerprint was Some).
2300        //   - V3+: fingerprint_len always present; fingerprint_len==0 ⇒ None.
2301        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
2302            || version == SEMANTIC_INDEX_VERSION_V3
2303            || version == SEMANTIC_INDEX_VERSION_V4
2304            || version == SEMANTIC_INDEX_VERSION_V5
2305            || version == SEMANTIC_INDEX_VERSION_V6;
2306        let fingerprint = if has_fingerprint_field {
2307            let fingerprint_len = read_u32(data, &mut pos)? as usize;
2308            if pos + fingerprint_len > data.len() {
2309                return Err("unexpected end of data reading fingerprint".to_string());
2310            }
2311            if fingerprint_len == 0 {
2312                None
2313            } else {
2314                let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
2315                pos += fingerprint_len;
2316                Some(
2317                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
2318                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
2319                )
2320            }
2321        } else {
2322            None
2323        };
2324
2325        // File mtimes
2326        let mtime_count = read_u32(data, &mut pos)? as usize;
2327        if mtime_count > MAX_ENTRIES {
2328            return Err(format!("too many semantic file mtimes: {}", mtime_count));
2329        }
2330
2331        let vector_bytes = entry_count
2332            .checked_mul(dimension)
2333            .and_then(|count| count.checked_mul(F32_BYTES))
2334            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2335        if vector_bytes > data.len().saturating_sub(pos) {
2336            return Err("semantic index vectors exceed available data".to_string());
2337        }
2338
2339        let mut file_mtimes = HashMap::with_capacity(mtime_count);
2340        let mut file_sizes = HashMap::with_capacity(mtime_count);
2341        let mut file_hashes = HashMap::with_capacity(mtime_count);
2342        for _ in 0..mtime_count {
2343            let path = read_string(data, &mut pos)?;
2344            let secs = read_u64(data, &mut pos)?;
2345            // V3+ persists subsec_nanos alongside secs so staleness checks
2346            // survive restart round-trips. V1/V2 load with 0 nanos, which
2347            // causes one rebuild on upgrade (they never matched live APFS
2348            // mtimes anyway — the bug v0.15.2 fixes). After that rebuild,
2349            // the cache is persisted as V3 and stabilises.
2350            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
2351                || version == SEMANTIC_INDEX_VERSION_V4
2352                || version == SEMANTIC_INDEX_VERSION_V5
2353                || version == SEMANTIC_INDEX_VERSION_V6
2354            {
2355                read_u32(data, &mut pos)?
2356            } else {
2357                0
2358            };
2359            let size =
2360                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
2361                    read_u64(data, &mut pos)?
2362                } else {
2363                    0
2364                };
2365            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
2366                if pos + 32 > data.len() {
2367                    return Err("unexpected end of data reading content hash".to_string());
2368                }
2369                let mut hash_bytes = [0u8; 32];
2370                hash_bytes.copy_from_slice(&data[pos..pos + 32]);
2371                pos += 32;
2372                blake3::Hash::from_bytes(hash_bytes)
2373            } else {
2374                cache_freshness::zero_hash()
2375            };
2376            // Hardening against corrupt / maliciously crafted cache files
2377            // (v0.15.2). `Duration::new(secs, nanos)` can panic when the
2378            // nanosecond carry overflows the second counter, and
2379            // `SystemTime + Duration` can panic on carry past the platform's
2380            // upper bound. Explicit validation keeps a corrupted semantic.bin
2381            // from taking down the whole aft process.
2382            if nanos >= 1_000_000_000 {
2383                return Err(format!(
2384                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
2385                    nanos
2386                ));
2387            }
2388            let duration = std::time::Duration::new(secs, nanos);
2389            let mtime = SystemTime::UNIX_EPOCH
2390                .checked_add(duration)
2391                .ok_or_else(|| {
2392                    format!(
2393                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
2394                        secs, nanos
2395                    )
2396                })?;
2397            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
2398                cached_path_under_root(current_canonical_root, &PathBuf::from(path))
2399                    .ok_or_else(|| "cached semantic mtime path escapes project root".to_string())?
2400            } else {
2401                PathBuf::from(path)
2402            };
2403            file_mtimes.insert(path.clone(), mtime);
2404            file_sizes.insert(path.clone(), size);
2405            file_hashes.insert(path, content_hash);
2406        }
2407
2408        // Entries
2409        let mut entries = Vec::with_capacity(entry_count);
2410        for _ in 0..entry_count {
2411            let raw_file = PathBuf::from(read_string(data, &mut pos)?);
2412            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
2413                cached_path_under_root(current_canonical_root, &raw_file)
2414                    .ok_or_else(|| "cached semantic entry path escapes project root".to_string())?
2415            } else {
2416                raw_file
2417            };
2418            let name = read_string(data, &mut pos)?;
2419
2420            if pos >= data.len() {
2421                return Err("unexpected end of data".to_string());
2422            }
2423            let kind = u8_to_symbol_kind(data[pos]);
2424            pos += 1;
2425
2426            let start_line = read_u32(data, &mut pos)?;
2427            let end_line = read_u32(data, &mut pos)?;
2428
2429            if pos >= data.len() {
2430                return Err("unexpected end of data".to_string());
2431            }
2432            let exported = data[pos] != 0;
2433            pos += 1;
2434
2435            let snippet = read_string(data, &mut pos)?;
2436            let embed_text = read_string(data, &mut pos)?;
2437
2438            // Vector
2439            let vec_bytes = dimension
2440                .checked_mul(F32_BYTES)
2441                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
2442            if pos + vec_bytes > data.len() {
2443                return Err("unexpected end of data reading vector".to_string());
2444            }
2445            let mut vector = Vec::with_capacity(dimension);
2446            for _ in 0..dimension {
2447                let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
2448                vector.push(f32::from_le_bytes(bytes));
2449                pos += 4;
2450            }
2451
2452            entries.push(EmbeddingEntry {
2453                chunk: SemanticChunk {
2454                    file,
2455                    name,
2456                    kind,
2457                    start_line,
2458                    end_line,
2459                    exported,
2460                    embed_text,
2461                    snippet,
2462                },
2463                vector,
2464            });
2465        }
2466
2467        if entries.len() != entry_count {
2468            return Err(format!(
2469                "semantic cache entry count drift: header={} decoded={}",
2470                entry_count,
2471                entries.len()
2472            ));
2473        }
2474        for entry in &entries {
2475            if !file_mtimes.contains_key(&entry.chunk.file) {
2476                return Err(format!(
2477                    "semantic cache metadata missing for entry file {}",
2478                    entry.chunk.file.display()
2479                ));
2480            }
2481        }
2482
2483        Ok(Self {
2484            entries,
2485            file_mtimes,
2486            file_sizes,
2487            file_hashes,
2488            dimension,
2489            fingerprint,
2490            project_root: current_canonical_root.to_path_buf(),
2491            deferred_files: HashSet::new(),
2492        })
2493    }
2494}
2495
2496/// Build enriched embedding text from a symbol with cAST-style context
2497fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2498    let relative = file
2499        .strip_prefix(project_root)
2500        .unwrap_or(file)
2501        .to_string_lossy();
2502
2503    let kind_label = match &symbol.kind {
2504        SymbolKind::Function => "function",
2505        SymbolKind::Class => "class",
2506        SymbolKind::Method => "method",
2507        SymbolKind::Struct => "struct",
2508        SymbolKind::Interface => "interface",
2509        SymbolKind::Enum => "enum",
2510        SymbolKind::TypeAlias => "type",
2511        SymbolKind::Variable => "variable",
2512        SymbolKind::Heading => "heading",
2513        SymbolKind::FileSummary => "file-summary",
2514    };
2515
2516    // Build: "file:relative/path kind:function name:validateAuth signature:fn validateAuth(token: &str) -> bool"
2517    let name = &symbol.name;
2518    let mut text = format!(
2519        "name:{name} file:{} kind:{} name:{name}",
2520        relative, kind_label
2521    );
2522
2523    if let Some(sig) = &symbol.signature {
2524        // Cap the signature: structured parsers (e.g. YAML/Kubernetes) pack
2525        // entire inline scripts (CronJob/Job `command:` bodies, multi-KB) into
2526        // the signature. Appending it unbounded produces a single embed_text
2527        // that overflows the embedding backend's physical batch (e.g. a
2528        // llama.cpp server's 512-token cap), aborting the whole index build
2529        // and silently degrading every search to lexical. 400 chars keeps the
2530        // identifying head of the signature without blowing the budget.
2531        text.push_str(&format!(" signature:{}", truncate_chars(sig, 400)));
2532    }
2533
2534    // Add body snippet (first ~300 chars of symbol body)
2535    let lines: Vec<&str> = source.lines().collect();
2536    let start = (symbol.range.start_line as usize).min(lines.len());
2537    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2538    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2539    if start < end {
2540        let body: String = lines[start..end]
2541            .iter()
2542            .take(15) // max 15 lines
2543            .copied()
2544            .collect::<Vec<&str>>()
2545            .join("\n");
2546        let snippet = if body.len() > 300 {
2547            format!("{}...", &body[..body.floor_char_boundary(300)])
2548        } else {
2549            body
2550        };
2551        text.push_str(&format!(" body:{}", snippet));
2552    }
2553
2554    // Final defense-in-depth clamp: no single embed_text may exceed the
2555    // backend's per-input budget regardless of which field grew. Most
2556    // backends cap a physical batch around 512 tokens; ~1600 chars stays
2557    // comfortably under that for typical English/code (≈4 chars/token).
2558    truncate_chars(&text, MAX_EMBED_TEXT_CHARS)
2559}
2560
/// Maximum number of characters allowed in one chunk's `embed_text`.
///
/// Chosen so a single input stays under the ~512-token physical batch limit
/// typical of embedding backends; an oversized symbol therefore cannot abort
/// the whole index build.
const MAX_EMBED_TEXT_CHARS: usize = 1_600;
2565
/// Return the first `max_chars` characters (Unicode scalar values, not bytes)
/// of `value` as an owned string; shorter inputs are returned whole.
fn truncate_chars(value: &str, max_chars: usize) -> String {
    match value.char_indices().nth(max_chars) {
        // Byte offset of the first character that must be dropped.
        Some((cut, _)) => value[..cut].to_string(),
        None => value.to_string(),
    }
}
2569
/// Extract the doc comment that opens `source`, if there is one.
///
/// Leading blank lines are ignored. When the first non-blank line starts with
/// `/**`, every line up to and including the first one containing `*/` is
/// returned. When it starts with `///` or `//!`, the contiguous run of such
/// lines is returned. Anything else yields an empty string. Lines keep their
/// original indentation and the result is limited to 200 characters.
fn first_leading_doc_comment(source: &str) -> String {
    // Clip to the 200-character budget (characters, not bytes).
    let clip = |text: &str| -> String { text.chars().take(200).collect() };

    let mut remaining = source
        .lines()
        .skip_while(|line| line.trim().is_empty())
        .peekable();
    let Some(head) = remaining.peek().copied().map(str::trim_start) else {
        return String::new();
    };

    if head.starts_with("/**") {
        // Keep consuming until the line that closes the block has been taken.
        let mut closed = false;
        let block: Vec<&str> = remaining
            .take_while(|line| {
                if closed {
                    return false;
                }
                closed = line.contains("*/");
                true
            })
            .collect();
        return clip(&block.join("\n"));
    }

    if head.starts_with("///") || head.starts_with("//!") {
        let run: Vec<&str> = remaining
            .take_while(|line| {
                let stripped = line.trim_start();
                stripped.starts_with("///") || stripped.starts_with("//!")
            })
            .collect();
        return clip(&run.join("\n"));
    }

    String::new()
}
2608
2609pub fn build_file_summary_chunk(
2610    file: &Path,
2611    project_root: &Path,
2612    source: &str,
2613    top_exports: &[&str],
2614    top_export_signatures: &[Option<&str>],
2615) -> SemanticChunk {
2616    let relative = file.strip_prefix(project_root).unwrap_or(file);
2617    let rel_path = relative.to_string_lossy();
2618    let parent_dir = relative
2619        .parent()
2620        .map(|parent| parent.to_string_lossy().to_string())
2621        .unwrap_or_default();
2622    let name = file
2623        .file_stem()
2624        .map(|stem| stem.to_string_lossy().to_string())
2625        .unwrap_or_default();
2626    let doc = first_leading_doc_comment(source);
2627    let exports = top_exports
2628        .iter()
2629        .take(5)
2630        .copied()
2631        .collect::<Vec<_>>()
2632        .join(",");
2633    let snippet = if doc.is_empty() {
2634        top_export_signatures
2635            .first()
2636            .and_then(|signature| signature.as_deref())
2637            .map(|signature| truncate_chars(signature, 200))
2638            .unwrap_or_default()
2639    } else {
2640        doc.clone()
2641    };
2642
2643    SemanticChunk {
2644        file: file.to_path_buf(),
2645        name,
2646        kind: SymbolKind::FileSummary,
2647        start_line: 0,
2648        end_line: 0,
2649        exported: false,
2650        embed_text: truncate_chars(
2651            &format!(
2652                "file:{rel_path} kind:file-summary name:{} parent:{parent_dir} doc:{doc} exports:{exports}",
2653                file.file_stem()
2654                    .map(|stem| stem.to_string_lossy().to_string())
2655                    .unwrap_or_default()
2656            ),
2657            MAX_EMBED_TEXT_CHARS,
2658        ),
2659        snippet,
2660    }
2661}
2662
2663fn parser_for(
2664    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2665    lang: crate::parser::LangId,
2666) -> Result<&mut Parser, String> {
2667    use std::collections::hash_map::Entry;
2668
2669    match parsers.entry(lang) {
2670        Entry::Occupied(entry) => Ok(entry.into_mut()),
2671        Entry::Vacant(entry) => {
2672            let grammar = grammar_for(lang);
2673            let mut parser = Parser::new();
2674            parser
2675                .set_language(&grammar)
2676                .map_err(|error| error.to_string())?;
2677            Ok(entry.insert(parser))
2678        }
2679    }
2680}
2681
/// Report whether `path` has one of the file extensions that the semantic
/// index covers. The comparison is exact (case-sensitive); paths without a
/// UTF-8 extension are never eligible.
pub fn is_semantic_indexed_extension(path: &Path) -> bool {
    const INDEXED_EXTENSIONS: &[&str] = &[
        "ts", "tsx", "js", "jsx", "py", "rs", "go", "c", "h", "cc", "cpp", "cxx", "hpp", "hh",
        "zig", "cs", "sh", "bash", "zsh", "inc", "php", "sol", "scss", "vue", "yaml", "yml",
    ];

    path.extension()
        .and_then(|extension| extension.to_str())
        .is_some_and(|extension| INDEXED_EXTENSIONS.contains(&extension))
}
2714
2715fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2716    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2717    let mtime = metadata.modified().map_err(|error| error.to_string())?;
2718    let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2719        .map_err(|error| error.to_string())?
2720        .unwrap_or_else(cache_freshness::zero_hash);
2721    Ok(IndexedFileMetadata {
2722        mtime,
2723        size: metadata.len(),
2724        content_hash,
2725    })
2726}
2727
/// Canonicalize `path`, tolerating a final component that no longer exists.
///
/// Resolution order: the path itself; failing that, its canonicalized parent
/// joined with the original file name; failing that (or when the path has no
/// parent or no file name), the path unchanged.
fn canonicalize_existing_or_deleted_path(path: &Path) -> PathBuf {
    match fs::canonicalize(path) {
        Ok(resolved) => resolved,
        Err(_) => match (path.parent(), path.file_name()) {
            (Some(parent), Some(leaf)) => match fs::canonicalize(parent) {
                Ok(resolved_parent) => resolved_parent.join(leaf),
                Err(_) => path.to_path_buf(),
            },
            _ => path.to_path_buf(),
        },
    }
}
2744
2745fn collect_file_chunks(
2746    project_root: &Path,
2747    file: &Path,
2748    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2749) -> Result<Vec<SemanticChunk>, String> {
2750    if !is_semantic_indexed_extension(file) {
2751        return Err("unsupported file extension".to_string());
2752    }
2753    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2754    let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2755    let tree = parser_for(parsers, lang)?
2756        .parse(&source, None)
2757        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2758    let symbols =
2759        extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2760
2761    Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2762}
2763
2764/// Build a display snippet from a symbol's source
2765fn build_snippet(symbol: &Symbol, source: &str) -> String {
2766    let lines: Vec<&str> = source.lines().collect();
2767    let start = (symbol.range.start_line as usize).min(lines.len());
2768    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2769    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2770    if start < end {
2771        let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2772        let mut snippet = snippet_lines.join("\n");
2773        if end - start > 5 {
2774            snippet.push_str("\n  ...");
2775        }
2776        if snippet.len() > 300 {
2777            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2778        }
2779        snippet
2780    } else {
2781        String::new()
2782    }
2783}
2784
2785/// Convert symbols to semantic chunks with enriched context
2786fn symbols_to_chunks(
2787    file: &Path,
2788    symbols: &[Symbol],
2789    source: &str,
2790    project_root: &Path,
2791) -> Vec<SemanticChunk> {
2792    let mut chunks = Vec::new();
2793    let top_exports_with_signatures = symbols
2794        .iter()
2795        .filter(|symbol| {
2796            symbol.exported
2797                && symbol.parent.is_none()
2798                && !matches!(symbol.kind, SymbolKind::Heading)
2799        })
2800        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2801        .collect::<Vec<_>>();
2802
2803    let has_only_headings = !symbols.is_empty()
2804        && symbols
2805            .iter()
2806            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
2807    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2808        let top_exports = top_exports_with_signatures
2809            .iter()
2810            .map(|(name, _)| *name)
2811            .collect::<Vec<_>>();
2812        let top_export_signatures = top_exports_with_signatures
2813            .iter()
2814            .map(|(_, signature)| *signature)
2815            .collect::<Vec<_>>();
2816        chunks.push(build_file_summary_chunk(
2817            file,
2818            project_root,
2819            source,
2820            &top_exports,
2821            &top_export_signatures,
2822        ));
2823    }
2824
2825    for symbol in symbols {
2826        // Skip Markdown / HTML heading chunks: empirically they dominate result
2827        // lists even for code-shaped queries because heading prose embeds well.
2828        // Agents querying for code lose the actual matches under doc noise.
2829        // README/docs queries are still served by grep on the same files.
2830        if matches!(symbol.kind, SymbolKind::Heading) {
2831            continue;
2832        }
2833
2834        // Skip very small symbols (single-line variables, etc.)
2835        let line_count = symbol
2836            .range
2837            .end_line
2838            .saturating_sub(symbol.range.start_line)
2839            + 1;
2840        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2841            continue;
2842        }
2843
2844        let embed_text = build_embed_text(symbol, source, file, project_root);
2845        let snippet = build_snippet(symbol, source);
2846
2847        chunks.push(SemanticChunk {
2848            file: file.to_path_buf(),
2849            name: symbol.name.clone(),
2850            kind: symbol.kind.clone(),
2851            start_line: symbol.range.start_line,
2852            end_line: symbol.range.end_line,
2853            exported: symbol.exported,
2854            embed_text,
2855            snippet,
2856        });
2857
2858        // Note: Nested symbols are handled separately by the outline system
2859        // Each symbol is indexed individually
2860    }
2861
2862    chunks
2863}
2864
/// Cosine similarity of two equally sized vectors.
///
/// Returns `0.0` when the lengths differ or when the product of the two norms
/// is zero (including the empty-vector case).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // One pass accumulating the dot product and both squared norms.
    let (dot, sq_a, sq_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, sq_a, sq_b), (&x, &y)| (dot + x * y, sq_a + x * x, sq_b + y * y),
    );

    let denom = sq_a.sqrt() * sq_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    dot / denom
}
2888
2889// Serialization helpers
2890fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2891    match kind {
2892        SymbolKind::Function => 0,
2893        SymbolKind::Class => 1,
2894        SymbolKind::Method => 2,
2895        SymbolKind::Struct => 3,
2896        SymbolKind::Interface => 4,
2897        SymbolKind::Enum => 5,
2898        SymbolKind::TypeAlias => 6,
2899        SymbolKind::Variable => 7,
2900        SymbolKind::Heading => 8,
2901        SymbolKind::FileSummary => 9,
2902    }
2903}
2904
2905fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2906    match v {
2907        0 => SymbolKind::Function,
2908        1 => SymbolKind::Class,
2909        2 => SymbolKind::Method,
2910        3 => SymbolKind::Struct,
2911        4 => SymbolKind::Interface,
2912        5 => SymbolKind::Enum,
2913        6 => SymbolKind::TypeAlias,
2914        7 => SymbolKind::Variable,
2915        8 => SymbolKind::Heading,
2916        9 => SymbolKind::FileSummary,
2917        _ => SymbolKind::Heading,
2918    }
2919}
2920
/// Read a little-endian `u32` from `data` at `*pos` and advance the cursor by
/// four bytes.
///
/// # Errors
/// Returns an error, leaving `*pos` untouched, when fewer than four bytes are
/// available at the cursor. A cursor beyond the end of `data` (however large)
/// is reported the same way instead of overflowing the bounds arithmetic.
fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
    // Checked slicing avoids computing `*pos + 4`, which could overflow.
    let bytes = data
        .get(*pos..)
        .and_then(|rest| rest.get(..4))
        .and_then(|chunk| <[u8; 4]>::try_from(chunk).ok())
        .ok_or_else(|| "unexpected end of data reading u32".to_string())?;
    *pos += 4;
    Ok(u32::from_le_bytes(bytes))
}
2929
/// Read a little-endian `u64` from `data` at `*pos` and advance the cursor by
/// eight bytes.
///
/// # Errors
/// Returns an error, leaving `*pos` untouched, when fewer than eight bytes are
/// available at the cursor. A cursor beyond the end of `data` (however large)
/// is reported the same way instead of overflowing the bounds arithmetic.
fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
    // Checked slicing avoids computing `*pos + 8`, which could overflow, and
    // removes the need for an `unwrap()` on the array conversion.
    let bytes = data
        .get(*pos..)
        .and_then(|rest| rest.get(..8))
        .and_then(|chunk| <[u8; 8]>::try_from(chunk).ok())
        .ok_or_else(|| "unexpected end of data reading u64".to_string())?;
    *pos += 8;
    Ok(u64::from_le_bytes(bytes))
}
2938
2939fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
2940    let len = read_u32(data, pos)? as usize;
2941    if *pos + len > data.len() {
2942        return Err("unexpected end of data reading string".to_string());
2943    }
2944    let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
2945    *pos += len;
2946    Ok(s)
2947}
2948
2949#[cfg(test)]
2950mod tests {
2951    use super::*;
2952    use crate::config::{SemanticBackend, SemanticBackendConfig};
2953    use crate::parser::FileParser;
2954    use std::io::{Read, Write};
2955    use std::net::TcpListener;
2956    use std::thread;
2957
2958    #[test]
2959    fn semantic_index_includes_php_inc_and_scss_extensions() {
2960        for file in ["partial.inc", "index.php", "styles.scss"] {
2961            assert!(
2962                is_semantic_indexed_extension(Path::new(file)),
2963                "{file} should be semantic-index eligible"
2964            );
2965        }
2966    }
2967
2968    fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
2969    where
2970        F: Fn(String, String, String) -> String + Send + 'static,
2971    {
2972        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
2973        let addr = listener.local_addr().expect("local addr");
2974        let handle = thread::spawn(move || {
2975            let (mut stream, _) = listener.accept().expect("accept request");
2976            let mut buf = Vec::new();
2977            let mut chunk = [0u8; 4096];
2978            let mut header_end = None;
2979            let mut content_length = 0usize;
2980            loop {
2981                let n = stream.read(&mut chunk).expect("read request");
2982                if n == 0 {
2983                    break;
2984                }
2985                buf.extend_from_slice(&chunk[..n]);
2986                if header_end.is_none() {
2987                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
2988                        header_end = Some(pos + 4);
2989                        let headers = String::from_utf8_lossy(&buf[..pos + 4]);
2990                        for line in headers.lines() {
2991                            if let Some(value) = line.strip_prefix("Content-Length:") {
2992                                content_length = value.trim().parse::<usize>().unwrap_or(0);
2993                            }
2994                        }
2995                    }
2996                }
2997                if let Some(end) = header_end {
2998                    if buf.len() >= end + content_length {
2999                        break;
3000                    }
3001                }
3002            }
3003
3004            let end = header_end.expect("header terminator");
3005            let request = String::from_utf8_lossy(&buf[..end]).to_string();
3006            let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
3007            let mut lines = request.lines();
3008            let request_line = lines.next().expect("request line").to_string();
3009            let path = request_line
3010                .split_whitespace()
3011                .nth(1)
3012                .expect("request path")
3013                .to_string();
3014            let response_body = handler(request_line, path, body);
3015            let response = format!(
3016                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3017                response_body.len(),
3018                response_body
3019            );
3020            stream
3021                .write_all(response.as_bytes())
3022                .expect("write response");
3023        });
3024
3025        (format!("http://{}", addr), handle)
3026    }
3027
3028    fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
3029        Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
3030    }
3031
3032    fn write_rust_file(path: &Path, function_name: &str) {
3033        fs::write(
3034            path,
3035            format!("pub fn {function_name}() -> bool {{\n    true\n}}\n"),
3036        )
3037        .unwrap();
3038    }
3039
3040    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
3041        let mut embed = test_vector_for_texts;
3042        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
3043    }
3044
3045    fn test_project_root() -> PathBuf {
3046        std::env::current_dir().unwrap()
3047    }
3048
3049    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
3050        index.file_mtimes.insert(file.to_path_buf(), mtime);
3051        index.file_sizes.insert(file.to_path_buf(), size);
3052        index
3053            .file_hashes
3054            .insert(file.to_path_buf(), cache_freshness::zero_hash());
3055    }
3056
3057    #[test]
3058    fn semantic_cache_serialization_skips_paths_outside_project_root() {
3059        let dir = tempfile::tempdir().expect("create temp dir");
3060        let project = fs::canonicalize(dir.path()).expect("canonical project");
3061        let outside = project.join("..").join("outside.rs");
3062        let mut index = SemanticIndex::new(project.clone(), 3);
3063        index
3064            .file_mtimes
3065            .insert(outside.clone(), SystemTime::UNIX_EPOCH);
3066        index.file_sizes.insert(outside.clone(), 1);
3067        index
3068            .file_hashes
3069            .insert(outside.clone(), cache_freshness::zero_hash());
3070        index.entries.push(EmbeddingEntry {
3071            chunk: SemanticChunk {
3072                file: outside,
3073                name: "outside".to_string(),
3074                kind: SymbolKind::Function,
3075                start_line: 0,
3076                end_line: 0,
3077                exported: false,
3078                embed_text: "outside".to_string(),
3079                snippet: "outside".to_string(),
3080            },
3081            vector: vec![1.0, 0.0, 0.0],
3082        });
3083
3084        let bytes = index.to_bytes();
3085        let loaded = SemanticIndex::from_bytes(&bytes, &project).expect("load serialized index");
3086        assert_eq!(loaded.entries.len(), 0);
3087        assert!(loaded.file_mtimes.is_empty());
3088    }
3089
3090    #[test]
3091    fn test_cosine_similarity_identical() {
3092        let a = vec![1.0, 0.0, 0.0];
3093        let b = vec![1.0, 0.0, 0.0];
3094        assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
3095    }
3096
3097    #[test]
3098    fn test_cosine_similarity_orthogonal() {
3099        let a = vec![1.0, 0.0, 0.0];
3100        let b = vec![0.0, 1.0, 0.0];
3101        assert!(cosine_similarity(&a, &b).abs() < 0.001);
3102    }
3103
3104    #[test]
3105    fn test_cosine_similarity_opposite() {
3106        let a = vec![1.0, 0.0, 0.0];
3107        let b = vec![-1.0, 0.0, 0.0];
3108        assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
3109    }
3110
3111    #[test]
3112    fn test_serialization_roundtrip() {
3113        let project_root = test_project_root();
3114        let file = project_root.join("src/main.rs");
3115        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
3116        index.entries.push(EmbeddingEntry {
3117            chunk: SemanticChunk {
3118                file: file.clone(),
3119                name: "handle_request".to_string(),
3120                kind: SymbolKind::Function,
3121                start_line: 10,
3122                end_line: 25,
3123                exported: true,
3124                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3125                snippet: "fn handle_request() {\n  // ...\n}".to_string(),
3126            },
3127            vector: vec![0.1, 0.2, 0.3, 0.4],
3128        });
3129        index.dimension = 4;
3130        index
3131            .file_mtimes
3132            .insert(file.clone(), SystemTime::UNIX_EPOCH);
3133        index.file_sizes.insert(file, 0);
3134        index.set_fingerprint(SemanticIndexFingerprint {
3135            backend: "fastembed".to_string(),
3136            model: "all-MiniLM-L6-v2".to_string(),
3137            base_url: FALLBACK_BACKEND.to_string(),
3138            dimension: 4,
3139            chunking_version: default_chunking_version(),
3140        });
3141
3142        let bytes = index.to_bytes();
3143        let restored = SemanticIndex::from_bytes(&bytes, &project_root).unwrap();
3144
3145        assert_eq!(restored.entries.len(), 1);
3146        assert_eq!(restored.entries[0].chunk.name, "handle_request");
3147        assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
3148        assert_eq!(restored.dimension, 4);
3149        assert_eq!(restored.backend_label(), Some("fastembed"));
3150        assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
3151    }
3152
3153    #[test]
3154    fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
3155        let cases = [
3156            (SymbolKind::Function, 0),
3157            (SymbolKind::Class, 1),
3158            (SymbolKind::Method, 2),
3159            (SymbolKind::Struct, 3),
3160            (SymbolKind::Interface, 4),
3161            (SymbolKind::Enum, 5),
3162            (SymbolKind::TypeAlias, 6),
3163            (SymbolKind::Variable, 7),
3164            (SymbolKind::Heading, 8),
3165            (SymbolKind::FileSummary, 9),
3166        ];
3167
3168        for (kind, encoded) in cases {
3169            assert_eq!(symbol_kind_to_u8(&kind), encoded);
3170            assert_eq!(u8_to_symbol_kind(encoded), kind);
3171        }
3172    }
3173
3174    #[test]
3175    fn test_search_top_k() {
3176        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3177        index.dimension = 3;
3178
3179        // Add entries with known vectors
3180        for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
3181            let mut vec = vec![0.0f32; 3];
3182            vec[i] = 1.0; // orthogonal vectors
3183            index.entries.push(EmbeddingEntry {
3184                chunk: SemanticChunk {
3185                    file: PathBuf::from("/src/lib.rs"),
3186                    name: name.to_string(),
3187                    kind: SymbolKind::Function,
3188                    start_line: (i * 10 + 1) as u32,
3189                    end_line: (i * 10 + 5) as u32,
3190                    exported: true,
3191                    embed_text: format!("kind:function name:{}", name),
3192                    snippet: format!("fn {}() {{}}", name),
3193                },
3194                vector: vec,
3195            });
3196        }
3197
3198        // Query aligned with "auth" (index 0)
3199        let query = vec![0.9, 0.1, 0.0];
3200        let results = index.search(&query, 2);
3201
3202        assert_eq!(results.len(), 2);
3203        assert_eq!(results[0].name, "auth"); // highest score
3204        assert!(results[0].score > results[1].score);
3205    }
3206
3207    #[test]
3208    fn test_empty_index_search() {
3209        let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3210        let results = index.search(&[0.1, 0.2, 0.3], 10);
3211        assert!(results.is_empty());
3212    }
3213
3214    #[test]
3215    fn single_line_symbol_builds_non_empty_snippet() {
3216        let symbol = Symbol {
3217            name: "answer".to_string(),
3218            kind: SymbolKind::Variable,
3219            range: crate::symbols::Range {
3220                start_line: 0,
3221                start_col: 0,
3222                end_line: 0,
3223                end_col: 24,
3224            },
3225            signature: Some("const answer = 42".to_string()),
3226            scope_chain: Vec::new(),
3227            exported: true,
3228            parent: None,
3229        };
3230        let source = "export const answer = 42;\n";
3231
3232        let snippet = build_snippet(&symbol, source);
3233
3234        assert_eq!(snippet, "export const answer = 42;");
3235    }
3236
3237    #[test]
3238    fn optimized_file_chunk_collection_matches_file_parser_path() {
3239        let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
3240        let file = project_root.join("src/semantic_index.rs");
3241        let source = std::fs::read_to_string(&file).unwrap();
3242
3243        let mut legacy_parser = FileParser::new();
3244        let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
3245        let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
3246
3247        let mut parsers = HashMap::new();
3248        let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
3249
3250        assert_eq!(
3251            chunk_fingerprint(&optimized_chunks),
3252            chunk_fingerprint(&legacy_chunks)
3253        );
3254    }
3255
3256    fn chunk_fingerprint(
3257        chunks: &[SemanticChunk],
3258    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
3259        chunks
3260            .iter()
3261            .map(|chunk| {
3262                (
3263                    chunk.name.clone(),
3264                    chunk.kind.clone(),
3265                    chunk.start_line,
3266                    chunk.end_line,
3267                    chunk.exported,
3268                    chunk.embed_text.clone(),
3269                    chunk.snippet.clone(),
3270                )
3271            })
3272            .collect()
3273    }
3274
3275    #[test]
3276    fn rejects_oversized_dimension_during_deserialization() {
3277        let mut bytes = Vec::new();
3278        bytes.push(1u8);
3279        bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
3280        bytes.extend_from_slice(&0u32.to_le_bytes());
3281        bytes.extend_from_slice(&0u32.to_le_bytes());
3282
3283        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
3284    }
3285
3286    #[test]
3287    fn rejects_oversized_entry_count_during_deserialization() {
3288        let mut bytes = Vec::new();
3289        bytes.push(1u8);
3290        bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
3291        bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
3292        bytes.extend_from_slice(&0u32.to_le_bytes());
3293
3294        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
3295    }
3296
3297    #[test]
3298    fn invalidate_file_removes_entries_and_mtime() {
3299        let target = PathBuf::from("/src/main.rs");
3300        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3301        index.entries.push(EmbeddingEntry {
3302            chunk: SemanticChunk {
3303                file: target.clone(),
3304                name: "main".to_string(),
3305                kind: SymbolKind::Function,
3306                start_line: 0,
3307                end_line: 1,
3308                exported: false,
3309                embed_text: "main".to_string(),
3310                snippet: "fn main() {}".to_string(),
3311            },
3312            vector: vec![1.0; DEFAULT_DIMENSION],
3313        });
3314        index
3315            .file_mtimes
3316            .insert(target.clone(), SystemTime::UNIX_EPOCH);
3317        index.file_sizes.insert(target.clone(), 0);
3318
3319        index.invalidate_file(&target);
3320
3321        assert!(index.entries.is_empty());
3322        assert!(!index.file_mtimes.contains_key(&target));
3323        assert!(!index.file_sizes.contains_key(&target));
3324    }
3325
3326    #[test]
3327    fn refresh_missing_changed_file_is_purged_after_collect() {
3328        let temp = tempfile::tempdir().unwrap();
3329        let project_root = temp.path();
3330        let file = project_root.join("src/lib.rs");
3331        fs::create_dir_all(file.parent().unwrap()).unwrap();
3332        write_rust_file(&file, "vanished_symbol");
3333
3334        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3335        let original_size = *index.file_sizes.get(&file).unwrap();
3336        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, original_size + 1);
3337        fs::remove_file(&file).unwrap();
3338
3339        let mut embed = test_vector_for_texts;
3340        let mut progress = |_done: usize, _total: usize| {};
3341        let summary = index
3342            .refresh_stale_files(
3343                project_root,
3344                std::slice::from_ref(&file),
3345                &mut embed,
3346                8,
3347                &mut progress,
3348            )
3349            .unwrap();
3350
3351        assert_eq!(summary.changed, 0);
3352        assert_eq!(summary.added, 0);
3353        assert_eq!(summary.deleted, 1);
3354        assert!(index.entries.is_empty());
3355        assert!(!index.file_mtimes.contains_key(&file));
3356        assert!(!index.file_sizes.contains_key(&file));
3357        assert!(!index.file_hashes.contains_key(&file));
3358    }
3359
3360    #[test]
3361    fn refresh_collect_error_for_existing_path_preserves_cached_entry() {
3362        let temp = tempfile::tempdir().unwrap();
3363        let project_root = temp.path();
3364        let file = project_root.join("src/lib.rs");
3365        fs::create_dir_all(file.parent().unwrap()).unwrap();
3366        write_rust_file(&file, "kept_symbol");
3367
3368        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3369        let original_entry_count = index.entries.len();
3370        let original_mtime = *index.file_mtimes.get(&file).unwrap();
3371        let original_size = *index.file_sizes.get(&file).unwrap();
3372
3373        let stale_mtime = SystemTime::UNIX_EPOCH;
3374        set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
3375        fs::remove_file(&file).unwrap();
3376        fs::create_dir(&file).unwrap();
3377
3378        let mut embed = test_vector_for_texts;
3379        let mut progress = |_done: usize, _total: usize| {};
3380        let summary = index
3381            .refresh_stale_files(
3382                project_root,
3383                std::slice::from_ref(&file),
3384                &mut embed,
3385                8,
3386                &mut progress,
3387            )
3388            .unwrap();
3389
3390        assert_eq!(summary.changed, 0);
3391        assert_eq!(summary.added, 0);
3392        assert_eq!(summary.deleted, 0);
3393        assert_eq!(index.entries.len(), original_entry_count);
3394        assert!(index
3395            .entries
3396            .iter()
3397            .any(|entry| entry.chunk.name == "kept_symbol"));
3398        assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
3399        assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
3400        assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
3401    }
3402
3403    #[test]
3404    fn refresh_never_indexed_file_error_does_not_record_mtime() {
3405        let temp = tempfile::tempdir().unwrap();
3406        let project_root = temp.path();
3407        let missing = project_root.join("src/missing.rs");
3408        fs::create_dir_all(missing.parent().unwrap()).unwrap();
3409
3410        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3411        let mut embed = test_vector_for_texts;
3412        let mut progress = |_done: usize, _total: usize| {};
3413        let summary = index
3414            .refresh_stale_files(
3415                project_root,
3416                std::slice::from_ref(&missing),
3417                &mut embed,
3418                8,
3419                &mut progress,
3420            )
3421            .unwrap();
3422
3423        assert_eq!(summary.added, 0);
3424        assert_eq!(summary.changed, 0);
3425        assert_eq!(summary.deleted, 0);
3426        assert!(!index.file_mtimes.contains_key(&missing));
3427        assert!(!index.file_sizes.contains_key(&missing));
3428        assert!(index.entries.is_empty());
3429    }
3430
3431    #[test]
3432    fn refresh_reports_added_for_new_files() {
3433        let temp = tempfile::tempdir().unwrap();
3434        let project_root = temp.path();
3435        let existing = project_root.join("src/lib.rs");
3436        let added = project_root.join("src/new.rs");
3437        fs::create_dir_all(existing.parent().unwrap()).unwrap();
3438        write_rust_file(&existing, "existing_symbol");
3439        write_rust_file(&added, "added_symbol");
3440
3441        let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
3442        let mut embed = test_vector_for_texts;
3443        let mut progress = |_done: usize, _total: usize| {};
3444        let summary = index
3445            .refresh_stale_files(
3446                project_root,
3447                &[existing.clone(), added.clone()],
3448                &mut embed,
3449                8,
3450                &mut progress,
3451            )
3452            .unwrap();
3453
3454        assert_eq!(summary.added, 1);
3455        assert_eq!(summary.changed, 0);
3456        assert_eq!(summary.deleted, 0);
3457        assert_eq!(summary.total_processed, 2);
3458        assert!(index.file_mtimes.contains_key(&added));
3459        assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
3460    }
3461
3462    #[test]
3463    fn refresh_reports_deleted_for_removed_files() {
3464        let temp = tempfile::tempdir().unwrap();
3465        let project_root = temp.path();
3466        let deleted = project_root.join("src/deleted.rs");
3467        fs::create_dir_all(deleted.parent().unwrap()).unwrap();
3468        write_rust_file(&deleted, "deleted_symbol");
3469
3470        let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
3471        fs::remove_file(&deleted).unwrap();
3472
3473        let mut embed = test_vector_for_texts;
3474        let mut progress = |_done: usize, _total: usize| {};
3475        let summary = index
3476            .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
3477            .unwrap();
3478
3479        assert_eq!(summary.deleted, 1);
3480        assert_eq!(summary.changed, 0);
3481        assert_eq!(summary.added, 0);
3482        assert_eq!(summary.total_processed, 1);
3483        assert!(!index.file_mtimes.contains_key(&deleted));
3484        assert!(index.entries.is_empty());
3485    }
3486
3487    #[test]
3488    fn refresh_reports_changed_for_modified_files() {
3489        let temp = tempfile::tempdir().unwrap();
3490        let project_root = temp.path();
3491        let file = project_root.join("src/lib.rs");
3492        fs::create_dir_all(file.parent().unwrap()).unwrap();
3493        write_rust_file(&file, "old_symbol");
3494
3495        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3496        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
3497        write_rust_file(&file, "new_symbol");
3498
3499        let mut embed = test_vector_for_texts;
3500        let mut progress = |_done: usize, _total: usize| {};
3501        let summary = index
3502            .refresh_stale_files(
3503                project_root,
3504                std::slice::from_ref(&file),
3505                &mut embed,
3506                8,
3507                &mut progress,
3508            )
3509            .unwrap();
3510
3511        assert_eq!(summary.changed, 1);
3512        assert_eq!(summary.added, 0);
3513        assert_eq!(summary.deleted, 0);
3514        assert_eq!(summary.total_processed, 1);
3515        assert!(index
3516            .entries
3517            .iter()
3518            .any(|entry| entry.chunk.name == "new_symbol"));
3519        assert!(!index
3520            .entries
3521            .iter()
3522            .any(|entry| entry.chunk.name == "old_symbol"));
3523    }
3524
3525    #[test]
3526    fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
3527        let temp = tempfile::tempdir().unwrap();
3528        let project_root = temp.path();
3529        let file = project_root.join("src/lib.rs");
3530        fs::create_dir_all(file.parent().unwrap()).unwrap();
3531        write_rust_file(&file, "clean_symbol");
3532
3533        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
3534        let original_entries = index.entries.len();
3535        let mut embed_called = false;
3536        let mut embed = |texts: Vec<String>| {
3537            embed_called = true;
3538            test_vector_for_texts(texts)
3539        };
3540        let mut progress = |_done: usize, _total: usize| {};
3541        let summary = index
3542            .refresh_stale_files(
3543                project_root,
3544                std::slice::from_ref(&file),
3545                &mut embed,
3546                8,
3547                &mut progress,
3548            )
3549            .unwrap();
3550
3551        assert!(summary.is_noop());
3552        assert_eq!(summary.total_processed, 1);
3553        assert!(!embed_called);
3554        assert_eq!(index.entries.len(), original_entries);
3555    }
3556
3557    #[test]
3558    fn detects_missing_onnx_runtime_from_dynamic_load_error() {
3559        let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
3560
3561        assert!(is_onnx_runtime_unavailable(message));
3562    }
3563
3564    #[test]
3565    fn formats_missing_onnx_runtime_with_install_hint() {
3566        let message = format_embedding_init_error(
3567            "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
3568        );
3569
3570        assert!(message.starts_with("ONNX Runtime not found. Install via:"));
3571        assert!(message.contains("Original error:"));
3572    }
3573
3574    #[test]
3575    fn openai_compatible_backend_embeds_with_mock_server() {
3576        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3577            assert!(request_line.starts_with("POST "));
3578            assert_eq!(path, "/v1/embeddings");
3579            "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0},{\"embedding\":[0.4,0.5,0.6],\"index\":1}]}".to_string()
3580        });
3581
3582        let config = SemanticBackendConfig {
3583            backend: SemanticBackend::OpenAiCompatible,
3584            model: "test-embedding".to_string(),
3585            base_url: Some(base_url),
3586            api_key_env: None,
3587            timeout_ms: 5_000,
3588            max_batch_size: 64,
3589            max_files: 20_000,
3590        };
3591
3592        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3593        let vectors = model
3594            .embed(vec!["hello".to_string(), "world".to_string()])
3595            .unwrap();
3596
3597        assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
3598        handle.join().unwrap();
3599    }
3600
3601    /// Regression for issue #36: AFT was sending TWO Content-Type headers
3602    /// on the OpenAI embeddings request — once implicitly via `.json(&body)`
3603    /// and again explicitly via `.header("Content-Type", "application/json")`.
3604    /// reqwest's `.header()` calls `HeaderMap::append`, which produces two
3605    /// headers on the wire. OpenAI's /v1/embeddings endpoint rejects that
3606    /// with `HTTP 400 "you must provide a model parameter"` even though the
3607    /// body actually contains `model`. The fix is to drop the explicit
3608    /// `.header("Content-Type", ...)` call. This test pins that we send
3609    /// exactly one Content-Type header.
3610    #[test]
3611    fn openai_compatible_request_has_single_content_type_header() {
3612        use std::sync::{Arc, Mutex};
3613        let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
3614        let captured_for_thread = Arc::clone(&captured);
3615
3616        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3617        let addr = listener.local_addr().expect("local addr");
3618        let handle = thread::spawn(move || {
3619            let (mut stream, _) = listener.accept().expect("accept");
3620            let mut buf = Vec::new();
3621            let mut chunk = [0u8; 4096];
3622            let mut header_end = None;
3623            let mut content_length = 0usize;
3624            loop {
3625                let n = stream.read(&mut chunk).expect("read");
3626                if n == 0 {
3627                    break;
3628                }
3629                buf.extend_from_slice(&chunk[..n]);
3630                if header_end.is_none() {
3631                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3632                        header_end = Some(pos + 4);
3633                        for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
3634                            if let Some(value) = line.strip_prefix("Content-Length:") {
3635                                content_length = value.trim().parse::<usize>().unwrap_or(0);
3636                            }
3637                        }
3638                    }
3639                }
3640                if let Some(end) = header_end {
3641                    if buf.len() >= end + content_length {
3642                        break;
3643                    }
3644                }
3645            }
3646            *captured_for_thread.lock().unwrap() = buf;
3647            let body = "{\"data\":[{\"embedding\":[0.1,0.2,0.3],\"index\":0}]}";
3648            let response = format!(
3649                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3650                body.len(),
3651                body
3652            );
3653            let _ = stream.write_all(response.as_bytes());
3654        });
3655
3656        let config = SemanticBackendConfig {
3657            backend: SemanticBackend::OpenAiCompatible,
3658            model: "text-embedding-3-small".to_string(),
3659            base_url: Some(format!("http://{}", addr)),
3660            api_key_env: None,
3661            timeout_ms: 5_000,
3662            max_batch_size: 64,
3663            max_files: 20_000,
3664        };
3665        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3666        let _ = model.embed(vec!["probe".to_string()]).unwrap();
3667        handle.join().unwrap();
3668
3669        let bytes = captured.lock().unwrap().clone();
3670        let request = String::from_utf8_lossy(&bytes);
3671
3672        // Lowercase line counts because HTTP headers are case-insensitive
3673        // and reqwest may emit `content-type` in lowercase under HTTP/2.
3674        let content_type_lines = request
3675            .lines()
3676            .filter(|line| {
3677                let lower = line.to_ascii_lowercase();
3678                lower.starts_with("content-type:")
3679            })
3680            .count();
3681        assert_eq!(
3682            content_type_lines, 1,
3683            "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
3684        );
3685
3686        // The body must still include the model field — pin this so a future
3687        // change can't accidentally drop `model` while fixing duplicate headers.
3688        assert!(
3689            request.contains(r#""model":"text-embedding-3-small""#),
3690            "request body should contain model field; full request:\n{request}",
3691        );
3692    }
3693
3694    #[test]
3695    fn ollama_backend_embeds_with_mock_server() {
3696        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3697            assert!(request_line.starts_with("POST "));
3698            assert_eq!(path, "/api/embed");
3699            "{\"embeddings\":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}".to_string()
3700        });
3701
3702        let config = SemanticBackendConfig {
3703            backend: SemanticBackend::Ollama,
3704            model: "embeddinggemma".to_string(),
3705            base_url: Some(base_url),
3706            api_key_env: None,
3707            timeout_ms: 5_000,
3708            max_batch_size: 64,
3709            max_files: 20_000,
3710        };
3711
3712        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3713        let vectors = model
3714            .embed(vec!["hello".to_string(), "world".to_string()])
3715            .unwrap();
3716
3717        assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
3718        handle.join().unwrap();
3719    }
3720
3721    #[test]
3722    fn read_from_disk_rejects_fingerprint_mismatch() {
3723        let storage = tempfile::tempdir().unwrap();
3724        let project_key = "proj";
3725
3726        let project_root = test_project_root();
3727        let file = project_root.join("src/main.rs");
3728        let mut index = SemanticIndex::new(project_root.clone(), DEFAULT_DIMENSION);
3729        index.entries.push(EmbeddingEntry {
3730            chunk: SemanticChunk {
3731                file: file.clone(),
3732                name: "handle_request".to_string(),
3733                kind: SymbolKind::Function,
3734                start_line: 10,
3735                end_line: 25,
3736                exported: true,
3737                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3738                snippet: "fn handle_request() {}".to_string(),
3739            },
3740            vector: vec![0.1, 0.2, 0.3],
3741        });
3742        index.dimension = 3;
3743        index
3744            .file_mtimes
3745            .insert(file.clone(), SystemTime::UNIX_EPOCH);
3746        index.file_sizes.insert(file, 0);
3747        index.set_fingerprint(SemanticIndexFingerprint {
3748            backend: "openai_compatible".to_string(),
3749            model: "test-embedding".to_string(),
3750            base_url: "http://127.0.0.1:1234/v1".to_string(),
3751            dimension: 3,
3752            chunking_version: default_chunking_version(),
3753        });
3754        index.write_to_disk(storage.path(), project_key);
3755
3756        let matching = index.fingerprint().unwrap().as_string();
3757        assert!(SemanticIndex::read_from_disk(
3758            storage.path(),
3759            project_key,
3760            &project_root,
3761            false,
3762            Some(&matching),
3763        )
3764        .is_some());
3765
3766        let mismatched = SemanticIndexFingerprint {
3767            backend: "ollama".to_string(),
3768            model: "embeddinggemma".to_string(),
3769            base_url: "http://127.0.0.1:11434".to_string(),
3770            dimension: 3,
3771            chunking_version: default_chunking_version(),
3772        }
3773        .as_string();
3774        assert!(SemanticIndex::read_from_disk(
3775            storage.path(),
3776            project_key,
3777            &project_root,
3778            false,
3779            Some(&mismatched),
3780        )
3781        .is_none());
3782    }
3783
3784    #[test]
3785    fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
3786        let storage = tempfile::tempdir().unwrap();
3787        let project_key = "proj-v3";
3788        let dir = storage.path().join("semantic").join(project_key);
3789        fs::create_dir_all(&dir).unwrap();
3790
3791        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3792        index.entries.push(EmbeddingEntry {
3793            chunk: SemanticChunk {
3794                file: PathBuf::from("/src/main.rs"),
3795                name: "handle_request".to_string(),
3796                kind: SymbolKind::Function,
3797                start_line: 0,
3798                end_line: 0,
3799                exported: true,
3800                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3801                snippet: "fn handle_request() {}".to_string(),
3802            },
3803            vector: vec![0.1, 0.2, 0.3],
3804        });
3805        index.dimension = 3;
3806        index
3807            .file_mtimes
3808            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
3809        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
3810        let fingerprint = SemanticIndexFingerprint {
3811            backend: "fastembed".to_string(),
3812            model: "test".to_string(),
3813            base_url: FALLBACK_BACKEND.to_string(),
3814            dimension: 3,
3815            chunking_version: default_chunking_version(),
3816        };
3817        index.set_fingerprint(fingerprint.clone());
3818
3819        let mut bytes = index.to_bytes();
3820        bytes[0] = SEMANTIC_INDEX_VERSION_V3;
3821        fs::write(dir.join("semantic.bin"), bytes).unwrap();
3822
3823        assert!(SemanticIndex::read_from_disk(
3824            storage.path(),
3825            project_key,
3826            &test_project_root(),
3827            false,
3828            Some(&fingerprint.as_string())
3829        )
3830        .is_none());
3831        assert!(!dir.join("semantic.bin").exists());
3832    }
3833
3834    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3835        crate::symbols::Symbol {
3836            name: name.to_string(),
3837            kind,
3838            range: crate::symbols::Range {
3839                start_line: start,
3840                start_col: 0,
3841                end_line: end,
3842                end_col: 0,
3843            },
3844            signature: None,
3845            scope_chain: Vec::new(),
3846            exported: false,
3847            parent: None,
3848        }
3849    }
3850
3851    /// Heading symbols (Markdown / HTML headings) must NOT be indexed —
3852    /// they overwhelmingly dominated semantic results even on code-shaped
3853    /// queries because heading prose embeds far more strongly than code
3854    /// chunks. Skipping headings keeps aft_search a code-finder.
3855    #[test]
3856    fn symbols_to_chunks_skips_heading_symbols() {
3857        let project_root = PathBuf::from("/proj");
3858        let file = project_root.join("README.md");
3859        let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
3860
3861        let symbols = vec![
3862            make_symbol(SymbolKind::Heading, "Title", 0, 2),
3863            make_symbol(SymbolKind::Heading, "Section", 4, 6),
3864        ];
3865
3866        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3867        assert!(
3868            chunks.is_empty(),
3869            "Heading symbols must be filtered out before embedding; got {} chunk(s)",
3870            chunks.len()
3871        );
3872    }
3873
3874    /// A symbol with an enormous signature (e.g. a YAML/Kubernetes CronJob
3875    /// whose inline `command:` script is parsed into the signature) must not
3876    /// produce an embed_text that overflows the embedding backend's physical
3877    /// batch. Before the clamp, the unbounded `signature:` append created a
3878    /// multi-KB input that aborted the whole index build and degraded every
3879    /// search to lexical-only.
3880    #[test]
3881    fn build_embed_text_clamps_oversized_signature() {
3882        let project_root = PathBuf::from("/proj");
3883        let file = project_root.join("cronjob.yaml");
3884        let huge_sig = "kubectl ".repeat(2000); // ~16 KB
3885        let source = "apiVersion: batch/v1\nkind: CronJob\n";
3886
3887        let mut symbol = make_symbol(SymbolKind::Class, "cluster-janitor", 0, 1);
3888        symbol.signature = Some(huge_sig);
3889
3890        let text = build_embed_text(&symbol, source, &file, &project_root);
3891        assert!(
3892            text.chars().count() <= MAX_EMBED_TEXT_CHARS,
3893            "embed_text must be clamped to {} chars, got {}",
3894            MAX_EMBED_TEXT_CHARS,
3895            text.chars().count()
3896        );
3897    }
3898
3899    /// Code symbols (functions, classes, methods, structs, etc.) must still
3900    /// be indexed alongside the heading skip — otherwise we'd starve the
3901    /// index entirely.
3902    #[test]
3903    fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
3904        let project_root = PathBuf::from("/proj");
3905        let file = project_root.join("src/lib.rs");
3906        let source = "pub fn handle_request() -> bool {\n    true\n}\n";
3907
3908        let symbols = vec![
3909            // A heading mixed in (e.g. from a doc comment block elsewhere).
3910            make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
3911            make_symbol(SymbolKind::Function, "handle_request", 0, 2),
3912            make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
3913        ];
3914
3915        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3916        assert_eq!(
3917            chunks.len(),
3918            3,
3919            "Expected file-summary + 2 code chunks (Function + Struct), got {}",
3920            chunks.len()
3921        );
3922        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
3923        assert!(chunks
3924            .iter()
3925            .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
3926        assert!(names.contains(&"handle_request"));
3927        assert!(names.contains(&"AuthService"));
3928        assert!(
3929            !names.contains(&"doc heading"),
3930            "Heading symbol leaked into chunks: {names:?}"
3931        );
3932    }
3933
3934    #[test]
3935    fn validate_ssrf_allows_loopback_hostnames() {
3936        // Loopback hostnames are explicitly allowed so self-hosted backends
3937        // (Ollama at http://localhost:11434) work at their default config.
3938        for host in &[
3939            "http://localhost",
3940            "http://localhost:8080",
3941            "http://localhost:11434", // Ollama default
3942            "http://localhost.localdomain",
3943            "http://foo.localhost",
3944        ] {
3945            assert!(
3946                validate_base_url_no_ssrf(host).is_ok(),
3947                "Expected {host} to be allowed (loopback), got: {:?}",
3948                validate_base_url_no_ssrf(host)
3949            );
3950        }
3951    }
3952
3953    #[test]
3954    fn validate_ssrf_allows_loopback_ips() {
3955        // 127.0.0.0/8 is loopback — by definition same-machine and not an
3956        // SSRF target. Allow it so Ollama at http://127.0.0.1:11434 works.
3957        for url in &[
3958            "http://127.0.0.1",
3959            "http://127.0.0.1:11434", // Ollama default
3960            "http://127.0.0.1:8080",
3961            "http://127.1.2.3",
3962        ] {
3963            let result = validate_base_url_no_ssrf(url);
3964            assert!(
3965                result.is_ok(),
3966                "Expected {url} to be allowed (loopback), got: {:?}",
3967                result
3968            );
3969        }
3970    }
3971
3972    #[test]
3973    fn validate_ssrf_rejects_private_non_loopback_ips() {
3974        // Non-loopback private/reserved IPs remain rejected — homelab/intranet
3975        // services on LAN IPs are real SSRF targets even though the user
3976        // configured them. Users who want this can opt in by binding the
3977        // service to a public-routable address.
3978        for url in &[
3979            "http://192.168.1.1",
3980            "http://10.0.0.1",
3981            "http://172.16.0.1",
3982            "http://169.254.169.254",
3983            "http://100.64.0.1",
3984        ] {
3985            let result = validate_base_url_no_ssrf(url);
3986            assert!(
3987                result.is_err(),
3988                "Expected {url} to be rejected (non-loopback private), got: {:?}",
3989                result
3990            );
3991        }
3992    }
3993
3994    #[test]
3995    fn validate_ssrf_rejects_mdns_local_hostnames() {
3996        // mDNS .local hostnames typically resolve to LAN devices, not
3997        // loopback. Rejecting them before DNS lookup gives a clearer error.
3998        for host in &[
3999            "http://printer.local",
4000            "http://nas.local:8080",
4001            "http://homelab.local",
4002        ] {
4003            let result = validate_base_url_no_ssrf(host);
4004            assert!(
4005                result.is_err(),
4006                "Expected {host} to be rejected (mDNS), got: {:?}",
4007                result
4008            );
4009        }
4010    }
4011
4012    #[test]
4013    fn normalize_base_url_allows_localhost_for_tests() {
4014        // normalize_base_url itself should NOT block localhost — only
4015        // validate_base_url_no_ssrf does. Tests construct backends directly.
4016        assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
4017        assert!(normalize_base_url("http://localhost:8080").is_ok());
4018    }
4019
4020    /// Pin the user-facing wording of the ONNX version-mismatch error.
4021    /// The auto-fix path MUST be listed first because it's the only safe
4022    /// option that doesn't require sudo or risk breaking other apps that
4023    /// link the system library. Regression of any of these strings would
4024    /// either mislead users (system rm before auto-fix) or break the
4025    /// `aft doctor --fix` discovery path.
4026    #[test]
4027    fn ort_mismatch_message_recommends_auto_fix_first() {
4028        let msg =
4029            format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
4030
4031        // The reported version and path must appear verbatim.
4032        assert!(
4033            msg.contains("v1.9.0"),
4034            "should report detected version: {msg}"
4035        );
4036        assert!(
4037            msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
4038            "should report system path: {msg}"
4039        );
4040        assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
4041
4042        // Solution ordering: auto-fix is #1, system rm is #2, install is #3.
4043        let auto_fix_pos = msg
4044            .find("Auto-fix")
4045            .expect("Auto-fix solution missing — users won't discover --fix");
4046        let remove_pos = msg
4047            .find("Remove the old library")
4048            .expect("system-rm solution missing");
4049        assert!(
4050            auto_fix_pos < remove_pos,
4051            "Auto-fix must come before manual rm — see PR comment thread"
4052        );
4053
4054        // The auto-fix command must be runnable as-is on a fresh system.
4055        assert!(
4056            msg.contains("npx @cortexkit/aft doctor --fix"),
4057            "auto-fix command must be present and copy-pasteable: {msg}"
4058        );
4059    }
4060
4061    /// macOS dylib paths must not produce a malformed message when the
4062    /// system path lacks a trailing slash. This is a regression guard
4063    /// for the "{}\n{}" format string contract.
4064    #[test]
4065    fn ort_mismatch_message_handles_macos_dylib_path() {
4066        let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
4067        assert!(msg.contains("v1.9.0"));
4068        assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
4069        // The dylib path must appear in the auto-fix paragraph (single
4070        // quotes around it) AND in the manual-rm paragraph; verify
4071        // both placements survived the format string.
4072        assert!(
4073            msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
4074            "system path should be quoted in the auto-fix sentence: {msg}"
4075        );
4076    }
4077}