
aft/semantic_index.rs

use crate::cache_freshness::{self, FileFreshness, FreshnessVerdict};
use crate::config::{SemanticBackend, SemanticBackendConfig};
use crate::parser::{detect_language, extract_symbols_from_tree, grammar_for};
use crate::symbols::{Symbol, SymbolKind};
use crate::{slog_info, slog_warn};

use fastembed::{EmbeddingModel as FastembedEmbeddingModel, InitOptions, TextEmbedding};
use rayon::prelude::*;
use reqwest::blocking::Client;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet, VecDeque};
use std::env;
use std::fmt::Display;
use std::fs;
use std::path::{Path, PathBuf};
use std::time::Duration;
use std::time::SystemTime;
use tree_sitter::Parser;
use url::Url;

const DEFAULT_DIMENSION: usize = 384;
const MAX_ENTRIES: usize = 1_000_000;
const MAX_DIMENSION: usize = 1024;
const F32_BYTES: usize = std::mem::size_of::<f32>();
const HEADER_BYTES_V1: usize = 9;
const HEADER_BYTES_V2: usize = 13;
const ONNX_RUNTIME_INSTALL_HINT: &str =
    "ONNX Runtime not found. Install via: brew install onnxruntime (macOS) or apt install libonnxruntime (Linux).";

const SEMANTIC_INDEX_VERSION_V1: u8 = 1;
const SEMANTIC_INDEX_VERSION_V2: u8 = 2;
/// V3 adds subsec_nanos to the file-mtime table so staleness detection survives
/// restart round-trips on filesystems with subsecond mtime precision (APFS,
/// ext4 with nsec, NTFS). V1/V2 persisted whole-second mtimes only, which
/// caused every restart to flag ~99% of files as stale and re-embed them.
const SEMANTIC_INDEX_VERSION_V3: u8 = 3;
/// V4 keeps the V3 on-disk layout but rebuilds persisted snippets once after
/// fixing symbol ranges that were incorrectly treated as 1-based.
const SEMANTIC_INDEX_VERSION_V4: u8 = 4;
/// V5 adds file sizes to the file metadata table so incremental staleness
/// detection can catch content changes even when mtime precision misses them.
const SEMANTIC_INDEX_VERSION_V5: u8 = 5;
/// V6 stores paths relative to project_root and adds content hashes.
const SEMANTIC_INDEX_VERSION_V6: u8 = 6;
const DEFAULT_OPENAI_EMBEDDING_PATH: &str = "/embeddings";
const DEFAULT_OLLAMA_EMBEDDING_PATH: &str = "/api/embed";
// Must stay below the bridge timeout (30s) to avoid bridge kills on slow backends.
const DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS: u64 = 25_000;
const DEFAULT_MAX_BATCH_SIZE: usize = 64;
const QUERY_EMBEDDING_CACHE_CAP: usize = 1_000;
const FALLBACK_BACKEND: &str = "none";
const EMBEDDING_REQUEST_MAX_ATTEMPTS: usize = 3;
const EMBEDDING_REQUEST_BACKOFF_MS: [u64; 2] = [500, 1_000];

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticIndexFingerprint {
    pub backend: String,
    pub model: String,
    #[serde(default)]
    pub base_url: String,
    pub dimension: usize,
    #[serde(default = "default_chunking_version")]
    pub chunking_version: u32,
}

fn default_chunking_version() -> u32 {
    2
}

impl SemanticIndexFingerprint {
    fn from_config(config: &SemanticBackendConfig, dimension: usize) -> Self {
        // Use normalized URL for fingerprinting so cosmetic differences
        // (e.g. "http://host/v1" vs "http://host/v1/") don't cause rebuilds.
        let base_url = config
            .base_url
            .as_ref()
            .and_then(|u| normalize_base_url(u).ok())
            .unwrap_or_else(|| FALLBACK_BACKEND.to_string());
        Self {
            backend: config.backend.as_str().to_string(),
            model: config.model.clone(),
            base_url,
            dimension,
            chunking_version: default_chunking_version(),
        }
    }

    pub fn as_string(&self) -> String {
        serde_json::to_string(self).unwrap_or_default()
    }

    fn matches_expected(&self, expected: &str) -> bool {
        let encoded = self.as_string();
        !encoded.is_empty() && encoded == expected
    }
}
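
// A minimal illustrative round-trip check of the fingerprint encoding above
// (hand-built values, not tied to any real config).
#[cfg(test)]
mod fingerprint_examples {
    use super::SemanticIndexFingerprint;

    #[test]
    fn encoded_fingerprint_matches_itself_and_rejects_others() {
        let fingerprint = SemanticIndexFingerprint {
            backend: "fastembed".to_string(),
            model: "all-MiniLM-L6-v2".to_string(),
            base_url: "none".to_string(),
            dimension: 384,
            chunking_version: 2,
        };
        let encoded = fingerprint.as_string();
        assert!(fingerprint.matches_expected(&encoded));
        assert!(!fingerprint.matches_expected("{}"));
    }
}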
97
98enum SemanticEmbeddingEngine {
99    Fastembed(TextEmbedding),
100    OpenAiCompatible {
101        client: Client,
102        model: String,
103        base_url: String,
104        api_key: Option<String>,
105    },
106    Ollama {
107        client: Client,
108        model: String,
109        base_url: String,
110    },
111}
112
113pub struct SemanticEmbeddingModel {
114    backend: SemanticBackend,
115    model: String,
116    base_url: Option<String>,
117    timeout_ms: u64,
118    max_batch_size: usize,
119    dimension: Option<usize>,
120    engine: SemanticEmbeddingEngine,
121    query_embedding_cache: HashMap<String, Vec<f32>>,
122    query_embedding_cache_order: VecDeque<String>,
123    query_embedding_cache_hits: u64,
124    query_embedding_cache_misses: u64,
125}
126
127pub type EmbeddingModel = SemanticEmbeddingModel;
128
129fn validate_embedding_batch(
130    vectors: &[Vec<f32>],
131    expected_count: usize,
132    context: &str,
133) -> Result<(), String> {
134    if expected_count > 0 && vectors.is_empty() {
135        return Err(format!(
136            "{context} returned no vectors for {expected_count} inputs"
137        ));
138    }
139
140    if vectors.len() != expected_count {
141        return Err(format!(
142            "{context} returned {} vectors for {} inputs",
143            vectors.len(),
144            expected_count
145        ));
146    }
147
148    let Some(first_vector) = vectors.first() else {
149        return Ok(());
150    };
151    let expected_dimension = first_vector.len();
152    for (index, vector) in vectors.iter().enumerate() {
153        if vector.len() != expected_dimension {
154            return Err(format!(
155                "{context} returned inconsistent embedding dimensions: vector 0 has length {expected_dimension}, vector {index} has length {}",
156                vector.len()
157            ));
158        }
159    }
160
161    Ok(())
162}
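
// Illustrative checks of the batch-validation contract above (sketch values).
#[cfg(test)]
mod validate_embedding_batch_examples {
    use super::validate_embedding_batch;

    #[test]
    fn count_and_dimension_mismatches_are_rejected() {
        // Matching count and consistent dimension: ok.
        assert!(validate_embedding_batch(&[vec![0.0; 4], vec![0.0; 4]], 2, "test").is_ok());
        // Wrong vector count: rejected.
        assert!(validate_embedding_batch(&[vec![0.0; 4]], 2, "test").is_err());
        // Inconsistent dimensions: rejected.
        assert!(validate_embedding_batch(&[vec![0.0; 4], vec![0.0; 3]], 2, "test").is_err());
    }
}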

/// Normalize a base URL: validate scheme and strip trailing slash.
/// Does NOT perform SSRF/private-IP validation — call
/// `validate_base_url_no_ssrf` separately when processing user-supplied config.
fn normalize_base_url(raw: &str) -> Result<String, String> {
    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;
    let scheme = parsed.scheme();
    if scheme != "http" && scheme != "https" {
        return Err(format!(
            "unsupported URL scheme '{}' — only http:// and https:// are allowed",
            scheme
        ));
    }
    Ok(parsed.to_string().trim_end_matches('/').to_string())
}
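
// Illustrative checks of the normalization contract above.
#[cfg(test)]
mod normalize_base_url_examples {
    use super::normalize_base_url;

    #[test]
    fn trailing_slash_differences_are_cosmetic() {
        assert_eq!(
            normalize_base_url("http://host/v1/").unwrap(),
            normalize_base_url("http://host/v1").unwrap()
        );
    }

    #[test]
    fn non_http_schemes_are_rejected() {
        assert!(normalize_base_url("ftp://host/v1").is_err());
    }
}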

/// Validate that a base URL does not point to a private/loopback address.
/// Call this on user-supplied config (at configure time) to prevent SSRF.
/// Not called for programmatically constructed configs (e.g. tests).
///
/// **Loopback is allowed.** Self-hosted embedding backends (e.g. Ollama at
/// `http://127.0.0.1:11434`) are a primary use case for `aft_search`. Loopback
/// addresses by definition cannot be exploited as SSRF targets — they only
/// reach services on the same machine. Allowing loopback unblocks Ollama at its
/// default config without opening up SSRF to LAN/intranet services, which
/// remain rejected.
///
/// **mDNS `.local` is rejected.** mDNS hostnames typically resolve to LAN
/// devices (printers, homelab servers); rejecting them before DNS lookup keeps
/// the SSRF guard meaningful for non-loopback private networks.
pub fn validate_base_url_no_ssrf(raw: &str) -> Result<(), String> {
    use std::net::{IpAddr, ToSocketAddrs};

    let parsed = Url::parse(raw).map_err(|error| format!("invalid base_url '{raw}': {error}"))?;

    let host = parsed.host_str().unwrap_or("");

    // Loopback hostnames are explicitly allowed. RFC 6761 mandates that
    // `localhost` and `*.localhost` resolve to loopback;
    // `localhost.localdomain` is a historical alias used on some Linux
    // distros. Self-hosted backends like Ollama use these by default.
    let is_loopback_host =
        host == "localhost" || host == "localhost.localdomain" || host.ends_with(".localhost");
    if is_loopback_host {
        return Ok(());
    }

    // mDNS hostnames are typically LAN devices, not loopback. Reject before
    // DNS lookup so users get a clear error rather than a private-IP error.
    if host.ends_with(".local") {
        return Err(format!(
            "base_url host '{host}' is an mDNS name — only loopback (localhost / 127.0.0.1) and public endpoints are allowed"
        ));
    }

    // Resolve the hostname. Reject private/link-local/CGNAT IPs but NOT
    // loopback (which is by definition same-machine and not an SSRF target).
    let port = parsed.port_or_known_default().unwrap_or(443);
    let addr_str = format!("{host}:{port}");
    let addrs: Vec<IpAddr> = addr_str
        .to_socket_addrs()
        .map(|iter| iter.map(|sa| sa.ip()).collect())
        .unwrap_or_default();
    for ip in &addrs {
        if is_private_non_loopback_ip(ip) {
            return Err(format!(
                "base_url '{raw}' resolves to a private/reserved IP — only loopback (127.0.0.1) and public endpoints are allowed"
            ));
        }
    }

    Ok(())
}
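
// Illustrative checks of the SSRF guard above. These cases avoid real DNS
// lookups: loopback hostnames short-circuit, `.local` is rejected before
// lookup, and IP literals resolve locally.
#[cfg(test)]
mod ssrf_guard_examples {
    use super::validate_base_url_no_ssrf;

    #[test]
    fn loopback_is_allowed() {
        assert!(validate_base_url_no_ssrf("http://localhost:11434").is_ok());
        assert!(validate_base_url_no_ssrf("http://127.0.0.1:11434").is_ok());
    }

    #[test]
    fn mdns_names_are_rejected_before_dns() {
        assert!(validate_base_url_no_ssrf("http://printer.local").is_err());
    }
}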

/// Returns true for IPv4/IPv6 addresses in private/link-local/CGNAT/wildcard
/// ranges, EXCLUDING loopback (127.0.0.0/8 and ::1). Loopback is considered
/// safe for SSRF purposes — see [`validate_base_url_no_ssrf`] for rationale.
fn is_private_non_loopback_ip(ip: &std::net::IpAddr) -> bool {
    use std::net::IpAddr;
    match ip {
        IpAddr::V4(v4) => {
            let o = v4.octets();
            // Note: 127.0.0.0/8 (loopback) is intentionally NOT in this set.
            // 10.0.0.0/8
            o[0] == 10
            // 172.16.0.0/12
            || (o[0] == 172 && (16..=31).contains(&o[1]))
            // 192.168.0.0/16
            || (o[0] == 192 && o[1] == 168)
            // 169.254.0.0/16 link-local
            || (o[0] == 169 && o[1] == 254)
            // 100.64.0.0/10 CGNAT
            || (o[0] == 100 && (64..=127).contains(&o[1]))
            // 0.0.0.0/8 wildcard
            || o[0] == 0
        }
        IpAddr::V6(v6) => {
            // Note: ::1 (loopback) is intentionally NOT in this set.
            // ::ffff:0:0/96 IPv4-mapped — classify by the embedded IPv4.
            if let Some(mapped) = v6.to_ipv4_mapped() {
                return is_private_non_loopback_ip(&IpAddr::V4(mapped));
            }
            // fe80::/10 link-local
            (v6.segments()[0] & 0xffc0) == 0xfe80
            // fc00::/7 unique-local
            || (v6.segments()[0] & 0xfe00) == 0xfc00
        }
    }
}
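
// Illustrative classification checks for the helper above.
#[cfg(test)]
mod private_ip_examples {
    use super::is_private_non_loopback_ip;
    use std::net::IpAddr;

    #[test]
    fn loopback_is_exempt_but_private_ranges_are_flagged() {
        let ip = |s: &str| s.parse::<IpAddr>().unwrap();
        assert!(!is_private_non_loopback_ip(&ip("127.0.0.1")));
        assert!(!is_private_non_loopback_ip(&ip("::1")));
        assert!(is_private_non_loopback_ip(&ip("10.1.2.3")));
        assert!(is_private_non_loopback_ip(&ip("192.168.1.1")));
        assert!(is_private_non_loopback_ip(&ip("169.254.0.5")));
        // IPv4-mapped IPv6 is classified by its embedded IPv4 address.
        assert!(is_private_non_loopback_ip(&ip("::ffff:10.0.0.1")));
    }
}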

fn build_openai_embeddings_endpoint(base_url: &str) -> String {
    if base_url.ends_with("/v1") {
        format!("{base_url}{DEFAULT_OPENAI_EMBEDDING_PATH}")
    } else {
        format!("{base_url}/v1{DEFAULT_OPENAI_EMBEDDING_PATH}")
    }
}

fn build_ollama_embeddings_endpoint(base_url: &str) -> String {
    if base_url.ends_with("/api") {
        format!("{base_url}/embed")
    } else {
        format!("{base_url}{DEFAULT_OLLAMA_EMBEDDING_PATH}")
    }
}
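
// Illustrative checks that the endpoint builders above don't duplicate suffixes.
#[cfg(test)]
mod endpoint_builder_examples {
    use super::{build_ollama_embeddings_endpoint, build_openai_embeddings_endpoint};

    #[test]
    fn openai_v1_suffix_is_appended_exactly_once() {
        assert_eq!(
            build_openai_embeddings_endpoint("http://host/v1"),
            "http://host/v1/embeddings"
        );
        assert_eq!(
            build_openai_embeddings_endpoint("http://host"),
            "http://host/v1/embeddings"
        );
    }

    #[test]
    fn ollama_api_suffix_is_appended_exactly_once() {
        assert_eq!(
            build_ollama_embeddings_endpoint("http://127.0.0.1:11434"),
            "http://127.0.0.1:11434/api/embed"
        );
        assert_eq!(
            build_ollama_embeddings_endpoint("http://127.0.0.1:11434/api"),
            "http://127.0.0.1:11434/api/embed"
        );
    }
}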

fn normalize_api_key(value: Option<String>) -> Option<String> {
    value.and_then(|token| {
        let token = token.trim();
        if token.is_empty() {
            None
        } else {
            Some(token.to_string())
        }
    })
}
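
// Illustrative checks of API-key normalization above.
#[cfg(test)]
mod api_key_normalization_examples {
    use super::normalize_api_key;

    #[test]
    fn blank_keys_collapse_to_none_and_whitespace_is_trimmed() {
        assert_eq!(normalize_api_key(None), None);
        assert_eq!(normalize_api_key(Some("   ".to_string())), None);
        assert_eq!(
            normalize_api_key(Some(" OPENAI_API_KEY ".to_string())),
            Some("OPENAI_API_KEY".to_string())
        );
    }
}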

fn is_retryable_embedding_status(status: reqwest::StatusCode) -> bool {
    status.is_server_error() || status == reqwest::StatusCode::TOO_MANY_REQUESTS
}

fn is_retryable_embedding_error(error: &reqwest::Error) -> bool {
    error.is_connect()
}

fn sleep_before_embedding_retry(attempt_index: usize) {
    if let Some(delay_ms) = EMBEDDING_REQUEST_BACKOFF_MS.get(attempt_index) {
        std::thread::sleep(Duration::from_millis(*delay_ms));
    }
}
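
// Illustrative checks of the retry classification above: 5xx and 429 are
// retried, other client errors fail fast.
#[cfg(test)]
mod retry_policy_examples {
    use super::is_retryable_embedding_status;
    use reqwest::StatusCode;

    #[test]
    fn server_errors_and_429_retry_while_other_4xx_fail_fast() {
        assert!(is_retryable_embedding_status(StatusCode::INTERNAL_SERVER_ERROR));
        assert!(is_retryable_embedding_status(StatusCode::BAD_GATEWAY));
        assert!(is_retryable_embedding_status(StatusCode::TOO_MANY_REQUESTS));
        assert!(!is_retryable_embedding_status(StatusCode::UNAUTHORIZED));
        assert!(!is_retryable_embedding_status(StatusCode::BAD_REQUEST));
    }
}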

fn send_embedding_request<F>(mut make_request: F, backend_label: &str) -> Result<String, String>
where
    F: FnMut() -> reqwest::blocking::RequestBuilder,
{
    for attempt_index in 0..EMBEDDING_REQUEST_MAX_ATTEMPTS {
        let last_attempt = attempt_index + 1 == EMBEDDING_REQUEST_MAX_ATTEMPTS;

        let response = match make_request().send() {
            Ok(response) => response,
            Err(error) => {
                if !last_attempt && is_retryable_embedding_error(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                return Err(format!("{backend_label} request failed: {error}"));
            }
        };

        let status = response.status();
        let raw = match response.text() {
            Ok(raw) => raw,
            Err(error) => {
                if !last_attempt && is_retryable_embedding_error(&error) {
                    sleep_before_embedding_retry(attempt_index);
                    continue;
                }
                return Err(format!("{backend_label} response read failed: {error}"));
            }
        };

        if status.is_success() {
            return Ok(raw);
        }

        if !last_attempt && is_retryable_embedding_status(status) {
            sleep_before_embedding_retry(attempt_index);
            continue;
        }

        return Err(format!(
            "{backend_label} request failed (HTTP {}): {}",
            status, raw
        ));
    }

    unreachable!("embedding request retries exhausted without returning")
}

impl SemanticEmbeddingModel {
    pub fn from_config(config: &SemanticBackendConfig) -> Result<Self, String> {
        let timeout_ms = if config.timeout_ms == 0 {
            DEFAULT_OPENAI_EMBEDDING_TIMEOUT_MS
        } else {
            config.timeout_ms
        };

        let max_batch_size = if config.max_batch_size == 0 {
            DEFAULT_MAX_BATCH_SIZE
        } else {
            config.max_batch_size
        };

        let api_key_env = normalize_api_key(config.api_key_env.clone());
        let model = config.model.clone();

        let client = Client::builder()
            .timeout(Duration::from_millis(timeout_ms))
            .redirect(reqwest::redirect::Policy::none())
            .build()
            .map_err(|error| format!("failed to configure embedding client: {error}"))?;

        let engine = match config.backend {
            SemanticBackend::Fastembed => {
                SemanticEmbeddingEngine::Fastembed(initialize_text_embedding(&model)?)
            }
            SemanticBackend::OpenAiCompatible => {
                let raw = config.base_url.as_ref().ok_or_else(|| {
                    "base_url is required for openai_compatible backend".to_string()
                })?;
                let base_url = normalize_base_url(raw)?;

                let api_key = match api_key_env {
                    Some(var_name) => Some(env::var(&var_name).map_err(|_| {
                        format!("missing api_key_env '{var_name}' for openai_compatible backend")
                    })?),
                    None => None,
                };

                SemanticEmbeddingEngine::OpenAiCompatible {
                    client,
                    model,
                    base_url,
                    api_key,
                }
            }
            SemanticBackend::Ollama => {
                let raw = config
                    .base_url
                    .as_ref()
                    .ok_or_else(|| "base_url is required for ollama backend".to_string())?;
                let base_url = normalize_base_url(raw)?;

                SemanticEmbeddingEngine::Ollama {
                    client,
                    model,
                    base_url,
                }
            }
        };

        Ok(Self {
            backend: config.backend,
            model: config.model.clone(),
            base_url: config.base_url.clone(),
            timeout_ms,
            max_batch_size,
            dimension: None,
            engine,
            query_embedding_cache: HashMap::new(),
            query_embedding_cache_order: VecDeque::new(),
            query_embedding_cache_hits: 0,
            query_embedding_cache_misses: 0,
        })
    }

    pub fn backend(&self) -> SemanticBackend {
        self.backend
    }

    pub fn model(&self) -> &str {
        &self.model
    }

    pub fn base_url(&self) -> Option<&str> {
        self.base_url.as_deref()
    }

    pub fn max_batch_size(&self) -> usize {
        self.max_batch_size
    }

    pub fn timeout_ms(&self) -> u64 {
        self.timeout_ms
    }

    pub fn fingerprint(
        &mut self,
        config: &SemanticBackendConfig,
    ) -> Result<SemanticIndexFingerprint, String> {
        let dimension = self.dimension()?;
        Ok(SemanticIndexFingerprint::from_config(config, dimension))
    }

    pub fn dimension(&mut self) -> Result<usize, String> {
        if let Some(dimension) = self.dimension {
            return Ok(dimension);
        }

        let dimension = match &mut self.engine {
            SemanticEmbeddingEngine::Fastembed(model) => {
                let vectors = model
                    .embed(vec!["semantic index fingerprint probe".to_string()], None)
                    .map_err(|error| format_embedding_init_error(error.to_string()))?;
                vectors
                    .first()
                    .map(|v| v.len())
                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
            }
            SemanticEmbeddingEngine::OpenAiCompatible { .. }
            | SemanticEmbeddingEngine::Ollama { .. } => {
                let vectors =
                    self.embed_texts(vec!["semantic index fingerprint probe".to_string()])?;
                vectors
                    .first()
                    .map(|v| v.len())
                    .ok_or_else(|| "embedding backend returned no vectors".to_string())?
            }
        };

        self.dimension = Some(dimension);
        Ok(dimension)
    }

    pub fn embed(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        self.embed_texts(texts)
    }

    pub fn embed_query_cached(&mut self, query: &str) -> Result<Vec<f32>, String> {
        if let Some(vector) = self.query_embedding_cache.get(query) {
            self.query_embedding_cache_hits += 1;
            return Ok(vector.clone());
        }

        self.query_embedding_cache_misses += 1;
        let embeddings = self.embed_texts(vec![query.to_string()])?;
        let vector = embeddings
            .first()
            .cloned()
            .ok_or_else(|| "embedding model returned no query vector".to_string())?;

        if self.query_embedding_cache.len() >= QUERY_EMBEDDING_CACHE_CAP {
            if let Some(oldest) = self.query_embedding_cache_order.pop_front() {
                self.query_embedding_cache.remove(&oldest);
            }
        }
        self.query_embedding_cache
            .insert(query.to_string(), vector.clone());
        self.query_embedding_cache_order
            .push_back(query.to_string());

        Ok(vector)
    }
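
    // Note on `embed_query_cached`: eviction is FIFO (insertion order), not
    // true LRU; a cache hit does not refresh the query's position in
    // `query_embedding_cache_order`.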

    pub fn query_embedding_cache_stats(&self) -> (u64, u64, usize) {
        (
            self.query_embedding_cache_hits,
            self.query_embedding_cache_misses,
            self.query_embedding_cache.len(),
        )
    }

    fn embed_texts(&mut self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
        match &mut self.engine {
            SemanticEmbeddingEngine::Fastembed(model) => {
                model.embed(texts, None::<usize>).map_err(|error| {
                    format!(
                        "failed to embed batch: {}",
                        format_embedding_init_error(error.to_string())
                    )
                })
            }
            SemanticEmbeddingEngine::OpenAiCompatible {
                client,
                model,
                base_url,
                api_key,
            } => {
                let expected_text_count = texts.len();
                let endpoint = build_openai_embeddings_endpoint(base_url);
                let body = serde_json::json!({
                    "input": texts,
                    "model": model,
                });

                let raw = send_embedding_request(
                    || {
                        // `.json(&body)` sets Content-Type: application/json
                        // automatically. Do NOT add `.header("Content-Type",
                        // "application/json")` afterwards — RequestBuilder::header()
                        // calls HeaderMap::append, which produces TWO Content-Type
                        // headers on the wire. OpenAI's /v1/embeddings endpoint
                        // treats duplicate Content-Type as malformed and rejects
                        // the body with 400 "you must provide a model parameter"
                        // even when `model` is set. Verified end-to-end against
                        // api.openai.com. See issue #36.
                        let mut request = client.post(&endpoint).json(&body);

                        if let Some(api_key) = api_key {
                            request = request.header("Authorization", format!("Bearer {api_key}"));
                        }

                        request
                    },
                    "openai compatible",
                )?;

                #[derive(Deserialize)]
                struct OpenAiResponse {
                    data: Vec<OpenAiEmbeddingResult>,
                }

                #[derive(Deserialize)]
                struct OpenAiEmbeddingResult {
                    embedding: Vec<f32>,
                    index: Option<u32>,
                }

                let parsed: OpenAiResponse = serde_json::from_str(&raw)
                    .map_err(|error| format!("invalid openai compatible response: {error}"))?;
                if parsed.data.len() != expected_text_count {
                    return Err(format!(
                        "openai compatible response returned {} embeddings for {} inputs",
                        parsed.data.len(),
                        expected_text_count
                    ));
                }

                let mut vectors = vec![Vec::new(); parsed.data.len()];
                for (i, item) in parsed.data.into_iter().enumerate() {
                    let index = item.index.unwrap_or(i as u32) as usize;
                    if index >= vectors.len() {
                        return Err(
                            "openai compatible response contains invalid vector index".to_string()
                        );
                    }
                    vectors[index] = item.embedding;
                }

                for vector in &vectors {
                    if vector.is_empty() {
                        return Err(
                            "openai compatible response contained missing vectors".to_string()
                        );
                    }
                }

                self.dimension = vectors.first().map(Vec::len);
                Ok(vectors)
            }
            SemanticEmbeddingEngine::Ollama {
                client,
                model,
                base_url,
            } => {
                let expected_text_count = texts.len();
                let endpoint = build_ollama_embeddings_endpoint(base_url);

                #[derive(Serialize)]
                struct OllamaPayload<'a> {
                    model: &'a str,
                    input: Vec<String>,
                }

                let payload = OllamaPayload {
                    model,
                    input: texts,
                };

                let raw = send_embedding_request(
                    || {
                        // `.json(&payload)` sets Content-Type automatically.
                        // Same duplicate-header trap as the OpenAI branch above
                        // — most Ollama servers tolerate it, but the
                        // single-Content-Type form is the correct one.
                        client.post(&endpoint).json(&payload)
                    },
                    "ollama",
                )?;

                #[derive(Deserialize)]
                struct OllamaResponse {
                    embeddings: Vec<Vec<f32>>,
                }

                let parsed: OllamaResponse = serde_json::from_str(&raw)
                    .map_err(|error| format!("invalid ollama response: {error}"))?;
                if parsed.embeddings.is_empty() {
                    return Err("ollama response returned no embeddings".to_string());
                }
                if parsed.embeddings.len() != expected_text_count {
                    return Err(format!(
                        "ollama response returned {} embeddings for {} inputs",
                        parsed.embeddings.len(),
                        expected_text_count
                    ));
                }

                let vectors = parsed.embeddings;
                for vector in &vectors {
                    if vector.is_empty() {
                        return Err("ollama response contained empty embeddings".to_string());
                    }
                }

                self.dimension = vectors.first().map(Vec::len);
                Ok(vectors)
            }
        }
    }
}

/// Pre-validate ONNX Runtime by attempting a raw dlopen before ort touches it.
/// This catches broken/incompatible .so files without risking a panic in the ort crate.
/// Also tries to detect the runtime version from the library path or soname.
pub fn pre_validate_onnx_runtime() -> Result<(), String> {
    let dylib_path = std::env::var("ORT_DYLIB_PATH").ok();

    #[cfg(any(target_os = "linux", target_os = "macos"))]
    {
        #[cfg(target_os = "linux")]
        let default_name = "libonnxruntime.so";
        #[cfg(target_os = "macos")]
        let default_name = "libonnxruntime.dylib";

        let lib_name = dylib_path.as_deref().unwrap_or(default_name);

        unsafe {
            let c_name = std::ffi::CString::new(lib_name)
                .map_err(|e| format!("invalid library path: {}", e))?;
            let handle = libc::dlopen(c_name.as_ptr(), libc::RTLD_NOW);
            if handle.is_null() {
                let err = libc::dlerror();
                let msg = if err.is_null() {
                    "unknown dlopen error".to_string()
                } else {
                    std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned()
                };
                return Err(format!(
                    "ONNX Runtime not found. dlopen('{}') failed: {}. \
                     Run `npx @cortexkit/aft doctor` to diagnose.",
                    lib_name, msg
                ));
            }

            // Try to detect the runtime version from the file path or soname.
            // libonnxruntime.so.1.19.0, libonnxruntime.1.24.4.dylib, etc.
            let detected_version = detect_ort_version_from_path(lib_name);

            libc::dlclose(handle);

            // Check version compatibility: we require 1.20+ (the auto-fix
            // path installs 1.24).
            if let Some(ref version) = detected_version {
                let parts: Vec<&str> = version.split('.').collect();
                if let (Some(major), Some(minor)) = (
                    parts.first().and_then(|s| s.parse::<u32>().ok()),
                    parts.get(1).and_then(|s| s.parse::<u32>().ok()),
                ) {
                    if major != 1 || minor < 20 {
                        return Err(format_ort_version_mismatch(version, lib_name));
                    }
                }
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        // On Windows, skip pre-validation — let ort handle LoadLibrary
        let _ = dylib_path;
    }

    Ok(())
}

/// Try to extract the ORT version from the library filename or resolved symlink.
/// Examples: "libonnxruntime.so.1.19.0" → "1.19.0", "libonnxruntime.1.24.4.dylib" → "1.24.4"
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn detect_ort_version_from_path(lib_path: &str) -> Option<String> {
    let path = std::path::Path::new(lib_path);

    // Try the path as given, then follow symlinks
    for candidate in [Some(path.to_path_buf()), std::fs::canonicalize(path).ok()]
        .into_iter()
        .flatten()
    {
        if let Some(name) = candidate.file_name().and_then(|n| n.to_str()) {
            if let Some(version) = extract_version_from_filename(name) {
                return Some(version);
            }
        }
    }

    // Also check for versioned siblings in the same directory
    if let Some(parent) = path.parent() {
        if let Ok(entries) = std::fs::read_dir(parent) {
            for entry in entries.flatten() {
                if let Some(name) = entry.file_name().to_str() {
                    if name.starts_with("libonnxruntime") {
                        if let Some(version) = extract_version_from_filename(name) {
                            return Some(version);
                        }
                    }
                }
            }
        }
    }

    None
}

/// Extract version from filenames like "libonnxruntime.so.1.19.0" or "libonnxruntime.1.24.4.dylib"
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn extract_version_from_filename(name: &str) -> Option<String> {
    // Match patterns: .so.X.Y.Z or .X.Y.Z.dylib or .X.Y.Z.so
    let re = regex::Regex::new(r"(\d+\.\d+\.\d+)").ok()?;
    re.find(name).map(|m| m.as_str().to_string())
}
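
// Illustrative parses for the version-extraction helper above.
#[cfg(test)]
mod ort_version_parse_examples {
    use super::extract_version_from_filename;

    #[test]
    fn parses_linux_and_macos_naming_schemes() {
        assert_eq!(
            extract_version_from_filename("libonnxruntime.so.1.19.0").as_deref(),
            Some("1.19.0")
        );
        assert_eq!(
            extract_version_from_filename("libonnxruntime.1.24.4.dylib").as_deref(),
            Some("1.24.4")
        );
        assert_eq!(extract_version_from_filename("libonnxruntime.so"), None);
    }
}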

#[cfg(any(test, target_os = "linux", target_os = "macos"))]
fn suggest_removal_command(lib_path: &str) -> String {
    if lib_path.starts_with("/usr/local/lib")
        || lib_path == "libonnxruntime.so"
        || lib_path == "libonnxruntime.dylib"
    {
        #[cfg(target_os = "linux")]
        return "   sudo rm /usr/local/lib/libonnxruntime* && sudo ldconfig".to_string();
        #[cfg(target_os = "macos")]
        return "   sudo rm /usr/local/lib/libonnxruntime*".to_string();
        #[cfg(target_os = "windows")]
        return "   Delete the ONNX Runtime DLL from your PATH".to_string();
    }
    format!("   rm '{}'", lib_path)
}

/// Build the user-facing error message for an incompatible ONNX Runtime
/// install. Extracted as a pure helper so we can unit-test the wording
/// stability — the auto-fix recommendation must always come first because
/// it's the only safe option, and the system-rm step must remain present
/// because some users prefer the system-wide cleanup path.
#[cfg(any(test, target_os = "linux", target_os = "macos"))]
pub(crate) fn format_ort_version_mismatch(version: &str, lib_name: &str) -> String {
    format!(
        "ONNX Runtime version mismatch: found v{} at '{}', but AFT requires v1.20+. \
         Solutions:\n\
         1. Auto-fix (recommended): run `npx @cortexkit/aft doctor --fix`. \
         This downloads AFT-managed ONNX Runtime v1.24 into AFT's storage and \
         configures the bridge to load it instead of the system library — no \
         changes to '{}'.\n\
         2. Remove the old library and restart (AFT auto-downloads the correct version on next start):\n\
         {}\n\
         3. Or install ONNX Runtime 1.24 system-wide: https://github.com/microsoft/onnxruntime/releases/tag/v1.24.0\n\
         4. Run `npx @cortexkit/aft doctor` for full diagnostics.",
        version,
        lib_name,
        lib_name,
        suggest_removal_command(lib_name),
    )
}
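
// Illustrative wording-stability check for the message above: the auto-fix
// recommendation must appear before the manual removal step.
#[cfg(test)]
mod ort_mismatch_message_examples {
    use super::format_ort_version_mismatch;

    #[test]
    fn autofix_recommendation_comes_before_removal_step() {
        let message = format_ort_version_mismatch("1.19.0", "libonnxruntime.so");
        let autofix = message.find("doctor --fix").expect("auto-fix step present");
        let removal = message
            .find("Remove the old library")
            .expect("removal step present");
        assert!(autofix < removal);
    }
}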

pub fn initialize_text_embedding(model: &str) -> Result<TextEmbedding, String> {
    // Pre-validate before ort can panic on a bad library
    pre_validate_onnx_runtime()?;

    let selected_model = match model {
        "all-MiniLM-L6-v2" | "all-minilm-l6-v2" => FastembedEmbeddingModel::AllMiniLML6V2,
        _ => {
            return Err(format!(
                "unsupported fastembed model '{}'. Supported: all-MiniLM-L6-v2",
                model
            ))
        }
    };

    TextEmbedding::try_new(InitOptions::new(selected_model)).map_err(format_embedding_init_error)
}

pub fn is_onnx_runtime_unavailable(message: &str) -> bool {
    if message.trim_start().starts_with("ONNX Runtime not found.") {
        return true;
    }

    let message = message.to_ascii_lowercase();
    let mentions_onnx_runtime = ["onnx runtime", "onnxruntime", "libonnxruntime"]
        .iter()
        .any(|pattern| message.contains(pattern));
    let mentions_dynamic_load_failure = [
        "shared library",
        "dynamic library",
        "failed to load",
        "could not load",
        "unable to load",
        "dlopen",
        "loadlibrary",
        "no such file",
        "not found",
    ]
    .iter()
    .any(|pattern| message.contains(pattern));

    mentions_onnx_runtime && mentions_dynamic_load_failure
}
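
// Illustrative classification checks for the detector above.
#[cfg(test)]
mod onnx_error_classification_examples {
    use super::is_onnx_runtime_unavailable;

    #[test]
    fn load_failures_are_detected_and_unrelated_errors_are_not() {
        assert!(is_onnx_runtime_unavailable(
            "ONNX Runtime not found. Install via: brew install onnxruntime"
        ));
        assert!(is_onnx_runtime_unavailable(
            "dlopen failed: libonnxruntime.so: no such file or directory"
        ));
        assert!(!is_onnx_runtime_unavailable("model file is corrupt"));
    }
}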

fn format_embedding_init_error(error: impl Display) -> String {
    let message = error.to_string();

    if is_onnx_runtime_unavailable(&message) {
        return format!("{ONNX_RUNTIME_INSTALL_HINT} Original error: {message}");
    }

    format!("failed to initialize semantic embedding model: {message}")
}

/// A chunk of code ready for embedding — derived from a Symbol with context enrichment
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// Absolute file path
    pub file: PathBuf,
    /// Symbol name
    pub name: String,
    /// Symbol kind (function, class, struct, etc.)
    pub kind: SymbolKind,
    /// Line range (0-based internally, inclusive)
    pub start_line: u32,
    pub end_line: u32,
    /// Whether the symbol is exported
    pub exported: bool,
    /// The enriched text that gets embedded (scope + signature + body snippet)
    pub embed_text: String,
    /// Short code snippet for display in results
    pub snippet: String,
}

/// A stored embedding entry — chunk metadata + vector
#[derive(Debug)]
struct EmbeddingEntry {
    chunk: SemanticChunk,
    vector: Vec<f32>,
}

/// The semantic index — stores embeddings for all symbols in a project
#[derive(Debug)]
pub struct SemanticIndex {
    entries: Vec<EmbeddingEntry>,
    /// Track which files are indexed and their mtime for staleness detection
    file_mtimes: HashMap<PathBuf, SystemTime>,
    /// Track indexed file sizes alongside mtimes for staleness detection
    file_sizes: HashMap<PathBuf, u64>,
    file_hashes: HashMap<PathBuf, blake3::Hash>,
    /// Embedding dimension (384 for MiniLM-L6-v2)
    dimension: usize,
    fingerprint: Option<SemanticIndexFingerprint>,
    project_root: PathBuf,
}

#[derive(Debug, Clone, Copy)]
struct IndexedFileMetadata {
    mtime: SystemTime,
    size: u64,
    content_hash: blake3::Hash,
}

/// Result of an incremental refresh of the semantic index. Counts are file
/// counts; `total_processed` is the number of current/deleted files considered.
#[derive(Debug, Default, Clone, Copy)]
pub struct RefreshSummary {
    pub changed: usize,
    pub added: usize,
    pub deleted: usize,
    pub total_processed: usize,
}

impl RefreshSummary {
    /// True when no files were touched.
    pub fn is_noop(&self) -> bool {
        self.changed == 0 && self.added == 0 && self.deleted == 0
    }
}
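
// Illustrative check that `is_noop` looks only at the change counters, not
// at `total_processed`.
#[cfg(test)]
mod refresh_summary_examples {
    use super::RefreshSummary;

    #[test]
    fn noop_ignores_total_processed() {
        let summary = RefreshSummary {
            total_processed: 42,
            ..RefreshSummary::default()
        };
        assert!(summary.is_noop());
    }
}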

/// Search result from a semantic query
#[derive(Debug, Clone)]
pub struct SemanticResult {
    pub file: PathBuf,
    pub name: String,
    pub kind: SymbolKind,
    pub start_line: u32,
    pub end_line: u32,
    pub exported: bool,
    pub snippet: String,
    pub score: f32,
    pub source: &'static str,
}

impl SemanticIndex {
    pub fn new(project_root: PathBuf, dimension: usize) -> Self {
        debug_assert!(project_root.is_absolute());
        Self {
            entries: Vec::new(),
            file_mtimes: HashMap::new(),
            file_sizes: HashMap::new(),
            file_hashes: HashMap::new(),
            dimension,
            fingerprint: None,
            project_root,
        }
    }

    /// Number of embedded symbol entries.
    pub fn entry_count(&self) -> usize {
        self.entries.len()
    }

    /// Human-readable status label for the index.
    pub fn status_label(&self) -> &'static str {
        if self.entries.is_empty() {
            "empty"
        } else {
            "ready"
        }
    }

    fn collect_chunks(
        project_root: &Path,
        files: &[PathBuf],
    ) -> (Vec<SemanticChunk>, HashMap<PathBuf, IndexedFileMetadata>) {
        let per_file: Vec<(
            PathBuf,
            Result<(IndexedFileMetadata, Vec<SemanticChunk>), String>,
        )> = files
            .par_iter()
            .map_init(HashMap::new, |parsers, file| {
                let result = collect_file_metadata(file).and_then(|metadata| {
                    collect_file_chunks(project_root, file, parsers)
                        .map(|chunks| (metadata, chunks))
                });
                (file.clone(), result)
            })
            .collect();

        let mut chunks: Vec<SemanticChunk> = Vec::new();
        let mut file_metadata: HashMap<PathBuf, IndexedFileMetadata> = HashMap::new();

        for (file, result) in per_file {
            match result {
                Ok((metadata, file_chunks)) => {
                    file_metadata.insert(file, metadata);
                    chunks.extend(file_chunks);
                }
                Err(error) => {
                    // "unsupported file extension" is expected for non-code files
                    // (json, xml, .gitignore, etc.) that get included in the
                    // project walk. This error used to be swallowed by
                    // .unwrap_or_default(); we now skip it silently to keep the
                    // log clean. Only real read/parse errors are worth surfacing.
                    if error == "unsupported file extension" {
                        continue;
                    }
                    slog_warn!(
                        "failed to collect semantic chunks for {}: {}",
                        file.display(),
                        error
                    );
                }
            }
        }

        (chunks, file_metadata)
    }

    fn build_from_chunks<F, P>(
        project_root: &Path,
        chunks: Vec<SemanticChunk>,
        file_metadata: HashMap<PathBuf, IndexedFileMetadata>,
        embed_fn: &mut F,
        max_batch_size: usize,
        mut progress: Option<&mut P>,
    ) -> Result<Self, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        debug_assert!(project_root.is_absolute());
        let total_chunks = chunks.len();

        if chunks.is_empty() {
            return Ok(Self {
                entries: Vec::new(),
                file_mtimes: file_metadata
                    .iter()
                    .map(|(path, metadata)| (path.clone(), metadata.mtime))
                    .collect(),
                file_sizes: file_metadata
                    .iter()
                    .map(|(path, metadata)| (path.clone(), metadata.size))
                    .collect(),
                file_hashes: file_metadata
                    .into_iter()
                    .map(|(path, metadata)| (path, metadata.content_hash))
                    .collect(),
                dimension: DEFAULT_DIMENSION,
                fingerprint: None,
                project_root: project_root.to_path_buf(),
            });
        }

        // Embed in batches
        let mut entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
        let mut expected_dimension: Option<usize> = None;
        let batch_size = max_batch_size.max(1);
        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|c| c.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            // Track consistent dimension across all batches
            if let Some(dim) = vectors.first().map(|v| v.len()) {
                match expected_dimension {
                    None => expected_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        return Err(format!(
                            "embedding dimension changed across batches: expected {expected}, got {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            if let Some(callback) = progress.as_mut() {
                callback(entries.len(), total_chunks);
            }
        }

        let dimension = entries
            .first()
            .map(|e| e.vector.len())
            .unwrap_or(DEFAULT_DIMENSION);

        Ok(Self {
            entries,
            file_mtimes: file_metadata
                .iter()
                .map(|(path, metadata)| (path.clone(), metadata.mtime))
                .collect(),
            file_sizes: file_metadata
                .iter()
                .map(|(path, metadata)| (path.clone(), metadata.size))
                .collect(),
            file_hashes: file_metadata
                .into_iter()
                .map(|(path, metadata)| (path, metadata.content_hash))
                .collect(),
            dimension,
            fingerprint: None,
            project_root: project_root.to_path_buf(),
        })
    }

    /// Build the semantic index from a set of files using the provided embedding function.
    /// `embed_fn` takes a batch of texts and returns a batch of embedding vectors.
    pub fn build<F>(
        project_root: &Path,
        files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
    ) -> Result<Self, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
    {
        let (chunks, file_metadata) = Self::collect_chunks(project_root, files);
        Self::build_from_chunks(
            project_root,
            chunks,
            file_metadata,
            embed_fn,
            max_batch_size,
            Option::<&mut fn(usize, usize)>::None,
        )
    }

    /// Build the semantic index and report embedding progress using entry counts.
    pub fn build_with_progress<F, P>(
        project_root: &Path,
        files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<Self, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        let (chunks, file_metadata) = Self::collect_chunks(project_root, files);
        let total_chunks = chunks.len();
        progress(0, total_chunks);
        Self::build_from_chunks(
            project_root,
            chunks,
            file_metadata,
            embed_fn,
            max_batch_size,
            Some(progress),
        )
    }
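
    // A minimal usage sketch for `build` (hypothetical caller with a stub
    // embedder that returns a fixed 384-dim zero vector per input text):
    //
    //     let mut embed_fn = |texts: Vec<String>| -> Result<Vec<Vec<f32>>, String> {
    //         Ok(texts.iter().map(|_| vec![0.0f32; 384]).collect())
    //     };
    //     let index = SemanticIndex::build(project_root, &files, &mut embed_fn, 64)?;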

    /// Incrementally refresh entries for changed/new files only, preserving cached
    /// embeddings for unchanged files. Used when loading the index from disk and
    /// finding that a small fraction of files have changed, been deleted, or
    /// appeared.
    ///
    /// Returns `RefreshSummary` describing what changed. On success, `self` is
    /// mutated in place and remains a valid index.
    ///
    /// `current_files` is the full set of files the project considers indexable
    /// (typically `walk_project_files(...)`). Files in the cache that are no
    /// longer in this set are treated as deleted.
    pub fn refresh_stale_files<F, P>(
        &mut self,
        project_root: &Path,
        current_files: &[PathBuf],
        embed_fn: &mut F,
        max_batch_size: usize,
        progress: &mut P,
    ) -> Result<RefreshSummary, String>
    where
        F: FnMut(Vec<String>) -> Result<Vec<Vec<f32>>, String>,
        P: FnMut(usize, usize),
    {
        self.backfill_missing_file_sizes();

        // 1. Bucket files into deleted / changed / added.
        let current_set: HashSet<&Path> = current_files.iter().map(PathBuf::as_path).collect();
        let total_processed = current_set.len() + self.file_mtimes.len()
            - self
                .file_mtimes
                .keys()
                .filter(|path| current_set.contains(path.as_path()))
                .count();

        // Files in cache that disappeared from disk OR are no longer in the
        // walked set. Both cases need their entries dropped.
        let mut deleted: Vec<PathBuf> = Vec::new();
        let mut changed: Vec<PathBuf> = Vec::new();
        let indexed_paths: Vec<PathBuf> = self.file_mtimes.keys().cloned().collect();
        for indexed_path in &indexed_paths {
            if !current_set.contains(indexed_path.as_path()) {
                deleted.push(indexed_path.clone());
                continue;
            }
            let cached = match (
                self.file_mtimes.get(indexed_path),
                self.file_sizes.get(indexed_path),
                self.file_hashes.get(indexed_path),
            ) {
                (Some(mtime), Some(size), Some(hash)) => Some(FileFreshness {
                    mtime: *mtime,
                    size: *size,
                    content_hash: *hash,
                }),
                _ => None,
            };
            match cached.map(|freshness| cache_freshness::verify_file(indexed_path, &freshness)) {
                Some(FreshnessVerdict::HotFresh) => {}
                Some(FreshnessVerdict::ContentFresh {
                    new_mtime,
                    new_size,
                }) => {
                    self.file_mtimes.insert(indexed_path.clone(), new_mtime);
                    self.file_sizes.insert(indexed_path.clone(), new_size);
                }
                Some(FreshnessVerdict::Stale | FreshnessVerdict::Deleted) | None => {
                    changed.push(indexed_path.clone());
                }
            }
        }

        // Files in walk that were never indexed.
        let mut added: Vec<PathBuf> = Vec::new();
        for path in current_files {
            if !self.file_mtimes.contains_key(path) {
                added.push(path.clone());
            }
        }

        // Fast path: nothing to do.
        if deleted.is_empty() && changed.is_empty() && added.is_empty() {
            progress(0, 0);
            return Ok(RefreshSummary {
                total_processed,
                ..RefreshSummary::default()
            });
        }

        // 2. Drop entries for deleted files immediately. Changed files are only
        //    replaced after successful re-extraction + embedding so transient
        //    read/parse errors keep the stale-but-valid cache entry.
        if !deleted.is_empty() {
            let deleted_set: HashSet<&Path> = deleted.iter().map(PathBuf::as_path).collect();
            self.entries
                .retain(|entry| !deleted_set.contains(entry.chunk.file.as_path()));
            for path in &deleted {
                self.file_mtimes.remove(path);
                self.file_sizes.remove(path);
                self.file_hashes.remove(path);
            }
        }

        // 3. Embed the changed + added set, if any.
        let mut to_embed: Vec<PathBuf> = Vec::with_capacity(changed.len() + added.len());
        to_embed.extend(changed.iter().cloned());
        to_embed.extend(added.iter().cloned());

        if to_embed.is_empty() {
            // Only deletions happened.
            progress(0, 0);
            return Ok(RefreshSummary {
                changed: 0,
                added: 0,
                deleted: deleted.len(),
                total_processed,
            });
        }

        let (chunks, fresh_metadata) = Self::collect_chunks(project_root, &to_embed);

        if chunks.is_empty() {
            progress(0, 0);
            let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
            if !successful_files.is_empty() {
                self.entries
                    .retain(|entry| !successful_files.contains(&entry.chunk.file));
            }
            let changed_count = changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            let added_count = added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count();
            for (file, metadata) in fresh_metadata {
                self.file_mtimes.insert(file.clone(), metadata.mtime);
                self.file_sizes.insert(file.clone(), metadata.size);
                self.file_hashes.insert(file.clone(), metadata.content_hash);
            }
            return Ok(RefreshSummary {
                changed: changed_count,
                added: added_count,
                deleted: deleted.len(),
                total_processed,
            });
        }

        // 4. Embed in batches and dimension-check against the existing index.
        let total_chunks = chunks.len();
        progress(0, total_chunks);
        let batch_size = max_batch_size.max(1);
        let existing_dimension = if self.entries.is_empty() {
            None
        } else {
            Some(self.dimension)
        };
        let mut new_entries: Vec<EmbeddingEntry> = Vec::with_capacity(chunks.len());
        let mut observed_dimension: Option<usize> = existing_dimension;

        for batch_start in (0..chunks.len()).step_by(batch_size) {
            let batch_end = (batch_start + batch_size).min(chunks.len());
            let batch_texts: Vec<String> = chunks[batch_start..batch_end]
                .iter()
                .map(|c| c.embed_text.clone())
                .collect();

            let vectors = embed_fn(batch_texts)?;
            validate_embedding_batch(&vectors, batch_end - batch_start, "embedding backend")?;

            if let Some(dim) = vectors.first().map(|v| v.len()) {
                match observed_dimension {
                    None => observed_dimension = Some(dim),
                    Some(expected) if dim != expected => {
                        // Refuse to mix dimensions in one index. Caller should
                        // fall back to a full rebuild.
                        return Err(format!(
                            "embedding dimension changed during incremental refresh: \
                             cached index uses {expected}, new vectors use {dim}"
                        ));
                    }
                    _ => {}
                }
            }

            for (i, vector) in vectors.into_iter().enumerate() {
                let chunk_idx = batch_start + i;
                new_entries.push(EmbeddingEntry {
                    chunk: chunks[chunk_idx].clone(),
                    vector,
                });
            }

            progress(new_entries.len(), total_chunks);
        }

        let successful_files: HashSet<PathBuf> = fresh_metadata.keys().cloned().collect();
        if !successful_files.is_empty() {
            self.entries
                .retain(|entry| !successful_files.contains(&entry.chunk.file));
        }

        self.entries.extend(new_entries);
        for (file, metadata) in fresh_metadata {
            self.file_mtimes.insert(file.clone(), metadata.mtime);
            self.file_sizes.insert(file.clone(), metadata.size);
            self.file_hashes.insert(file, metadata.content_hash);
        }
        if let Some(dim) = observed_dimension {
            self.dimension = dim;
        }

        Ok(RefreshSummary {
            changed: changed
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            added: added
                .iter()
                .filter(|path| successful_files.contains(*path))
                .count(),
            deleted: deleted.len(),
            total_processed,
        })
    }
1423
1424    /// Search the index with a query embedding, returning top-K results sorted by relevance
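    ///
    /// A minimal usage sketch (illustrative only: a real query vector must
    /// match the index dimension, which this 3-dim literal would not):
    ///
    /// ```ignore
    /// let results = index.search(&[0.9, 0.1, 0.0], 5);
    /// for hit in &results {
    ///     println!(
    ///         "{}:{} {} score={:.3}",
    ///         hit.file.display(),
    ///         hit.start_line,
    ///         hit.name,
    ///         hit.score
    ///     );
    /// }
    /// ```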
1425    pub fn search(&self, query_vector: &[f32], top_k: usize) -> Vec<SemanticResult> {
1426        if self.entries.is_empty() || query_vector.len() != self.dimension {
1427            return Vec::new();
1428        }
1429
1430        let mut scored: Vec<(f32, usize)> = self
1431            .entries
1432            .iter()
1433            .enumerate()
1434            .map(|(i, entry)| {
1435                let mut score = cosine_similarity(query_vector, &entry.vector);
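                // Fixed 10% boost for exported symbols: on near-ties, public
                // API surfaces first. (A negative cosine score becomes slightly
                // more negative, which is harmless since it ranks last anyway.)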
1436                if entry.chunk.exported {
1437                    score *= 1.1;
1438                }
1439                (score, i)
1440            })
1441            .collect();
1442
1443        // Sort descending by score
1444        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1445
1446        scored
1447            .into_iter()
1448            .take(top_k)
            // Keep the sort → take → map ordering explicit: dropping the old
            // `> 0.0` score floor can never evict a positive hit (top_k is
            // taken only after sorting), but it does mean zero-score noise can
            // appear in the tail when fewer than top_k entries score above zero.
1452            .map(|(score, idx)| {
1453                let entry = &self.entries[idx];
1454                SemanticResult {
1455                    file: entry.chunk.file.clone(),
1456                    name: entry.chunk.name.clone(),
1457                    kind: entry.chunk.kind.clone(),
1458                    start_line: entry.chunk.start_line,
1459                    end_line: entry.chunk.end_line,
1460                    exported: entry.chunk.exported,
1461                    snippet: entry.chunk.snippet.clone(),
1462                    score,
1463                    source: "semantic",
1464                }
1465            })
1466            .collect()
1467    }
1468
1469    /// Number of indexed entries
1470    pub fn len(&self) -> usize {
1471        self.entries.len()
1472    }
1473
    /// Check if a file needs re-indexing based on mtime, size, and content hash
1475    pub fn is_file_stale(&self, file: &Path) -> bool {
1476        let Some(stored_mtime) = self.file_mtimes.get(file) else {
1477            return true;
1478        };
1479        let Some(stored_size) = self.file_sizes.get(file) else {
1480            return true;
1481        };
1482        let Some(stored_hash) = self.file_hashes.get(file) else {
1483            return true;
1484        };
1485        let cached = FileFreshness {
1486            mtime: *stored_mtime,
1487            size: *stored_size,
1488            content_hash: *stored_hash,
1489        };
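        // Reading the verdict names (defined in cache_freshness): HotFresh
        // means mtime and size match exactly; ContentFresh means metadata
        // drifted but the stored hash still matches the bytes (e.g. a touch
        // without edits). Both count as fresh here.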
1490        match cache_freshness::verify_file(file, &cached) {
1491            FreshnessVerdict::HotFresh => false,
1492            FreshnessVerdict::ContentFresh { .. } => false,
1493            FreshnessVerdict::Stale | FreshnessVerdict::Deleted => true,
1494        }
1495    }
1496
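    /// Backfill sizes (and content hashes, where hash_file_if_small allows)
    /// for files recorded by a pre-V5 index, so staleness checks have complete
    /// metadata without forcing a full rebuild.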
1497    fn backfill_missing_file_sizes(&mut self) {
1498        for path in self.file_mtimes.keys() {
1499            if self.file_sizes.contains_key(path) {
1500                continue;
1501            }
1502            if let Ok(metadata) = fs::metadata(path) {
1503                self.file_sizes.insert(path.clone(), metadata.len());
1504                if let Ok(Some(hash)) = cache_freshness::hash_file_if_small(path, metadata.len()) {
1505                    self.file_hashes.insert(path.clone(), hash);
1506                }
1507            }
1508        }
1509    }
1510
1511    /// Remove entries for a specific file
1512    pub fn remove_file(&mut self, file: &Path) {
1513        self.invalidate_file(file);
1514    }
1515
1516    pub fn invalidate_file(&mut self, file: &Path) {
1517        self.entries.retain(|e| e.chunk.file != file);
1518        self.file_mtimes.remove(file);
1519        self.file_sizes.remove(file);
1520        self.file_hashes.remove(file);
1521    }
1522
1523    /// Get the embedding dimension
1524    pub fn dimension(&self) -> usize {
1525        self.dimension
1526    }
1527
1528    pub fn fingerprint(&self) -> Option<&SemanticIndexFingerprint> {
1529        self.fingerprint.as_ref()
1530    }
1531
1532    pub fn backend_label(&self) -> Option<&str> {
1533        self.fingerprint.as_ref().map(|f| f.backend.as_str())
1534    }
1535
1536    pub fn model_label(&self) -> Option<&str> {
1537        self.fingerprint.as_ref().map(|f| f.model.as_str())
1538    }
1539
1540    pub fn set_fingerprint(&mut self, fingerprint: SemanticIndexFingerprint) {
1541        self.fingerprint = Some(fingerprint);
1542    }
1543
1544    /// Write the semantic index to disk using atomic temp+rename pattern
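    /// (write to a pid+nanos temp file, fsync, then rename into place; rename
    /// within one directory is atomic on POSIX, so readers never observe a
    /// half-written semantic.bin).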
1545    pub fn write_to_disk(&self, storage_dir: &Path, project_key: &str) {
1546        // Don't persist empty indexes — they would be loaded on next startup
1547        // and prevent a fresh build that might find files.
1548        if self.entries.is_empty() {
1549            slog_info!("skipping semantic index persistence (0 entries)");
1550            return;
1551        }
1552        let dir = storage_dir.join("semantic").join(project_key);
1553        if let Err(e) = fs::create_dir_all(&dir) {
1554            slog_warn!("failed to create semantic cache dir: {}", e);
1555            return;
1556        }
1557        let data_path = dir.join("semantic.bin");
1558        let tmp_path = dir.join(format!(
1559            "semantic.bin.tmp.{}.{}",
1560            std::process::id(),
1561            SystemTime::now()
1562                .duration_since(SystemTime::UNIX_EPOCH)
1563                .unwrap_or(Duration::ZERO)
1564                .as_nanos()
1565        ));
1566        let bytes = self.to_bytes();
1567        let write_result = (|| -> std::io::Result<()> {
1568            use std::io::Write;
1569            let mut file = fs::File::create(&tmp_path)?;
1570            file.write_all(&bytes)?;
1571            file.sync_all()?;
1572            Ok(())
1573        })();
1574        if let Err(e) = write_result {
1575            slog_warn!("failed to write semantic index: {}", e);
1576            let _ = fs::remove_file(&tmp_path);
1577            return;
1578        }
1579        if let Err(e) = fs::rename(&tmp_path, &data_path) {
1580            slog_warn!("failed to rename semantic index: {}", e);
1581            let _ = fs::remove_file(&tmp_path);
1582            return;
1583        }
1584        slog_info!(
1585            "semantic index persisted: {} entries, {:.1} KB",
1586            self.entries.len(),
1587            bytes.len() as f64 / 1024.0
1588        );
1589    }
1590
1591    /// Read the semantic index from disk
1592    pub fn read_from_disk(
1593        storage_dir: &Path,
1594        project_key: &str,
1595        current_canonical_root: &Path,
1596        is_worktree_bridge: bool,
1597        expected_fingerprint: Option<&str>,
1598    ) -> Option<Self> {
1599        debug_assert!(current_canonical_root.is_absolute());
1600        let data_path = storage_dir
1601            .join("semantic")
1602            .join(project_key)
1603            .join("semantic.bin");
1604        let file_len = usize::try_from(fs::metadata(&data_path).ok()?.len()).ok()?;
1605        if file_len < HEADER_BYTES_V1 {
1606            slog_warn!(
1607                "corrupt semantic index (too small: {} bytes), removing",
1608                file_len
1609            );
1610            if !is_worktree_bridge {
1611                let _ = fs::remove_file(&data_path);
1612            }
1613            return None;
1614        }
1615
1616        let bytes = fs::read(&data_path).ok()?;
1617        let version = bytes[0];
1618        if version != SEMANTIC_INDEX_VERSION_V6 {
            slog_info!(
                "cached semantic index version {} does not match current {}, rebuilding",
                version,
                SEMANTIC_INDEX_VERSION_V6
            );
1624            if !is_worktree_bridge {
1625                let _ = fs::remove_file(&data_path);
1626            }
1627            return None;
1628        }
1629        match Self::from_bytes(&bytes, current_canonical_root) {
1630            Ok(index) => {
1631                if index.entries.is_empty() {
1632                    slog_info!("cached semantic index is empty, will rebuild");
1633                    if !is_worktree_bridge {
1634                        let _ = fs::remove_file(&data_path);
1635                    }
1636                    return None;
1637                }
1638                if let Some(expected) = expected_fingerprint {
1639                    let matches = index
1640                        .fingerprint()
1641                        .map(|fingerprint| fingerprint.matches_expected(expected))
1642                        .unwrap_or(false);
1643                    if !matches {
1644                        slog_info!("cached semantic index fingerprint mismatch, rebuilding");
1645                        if !is_worktree_bridge {
1646                            let _ = fs::remove_file(&data_path);
1647                        }
1648                        return None;
1649                    }
1650                }
1651                slog_info!(
1652                    "loaded semantic index from disk: {} entries",
1653                    index.entries.len()
1654                );
1655                Some(index)
1656            }
1657            Err(e) => {
1658                slog_warn!("corrupt semantic index, rebuilding: {}", e);
1659                if !is_worktree_bridge {
1660                    let _ = fs::remove_file(&data_path);
1661                }
1662                None
1663            }
1664        }
1665    }
1666
1667    /// Serialize the index to bytes for disk persistence
1668    pub fn to_bytes(&self) -> Vec<u8> {
1669        let mut buf = Vec::new();
1670        let fingerprint_bytes = self.fingerprint.as_ref().and_then(|fingerprint| {
1671            let encoded = fingerprint.as_string();
1672            if encoded.is_empty() {
1673                None
1674            } else {
1675                Some(encoded.into_bytes())
1676            }
1677        });
1678
1679        // Header: version(1) + dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint
1680        //
1681        // V6 is the single write format. Layout extends V5:
1682        //   - fingerprint is always represented (absent ⇒ fingerprint_len=0,
1683        //     no bytes follow). Uniform format simplifies the reader.
1684        //   - paths are relative to project_root.
1685        //   - file metadata stored as secs(u64) + subsec_nanos(u32) + size(u64) + blake3(32).
1686        //     Preserves full APFS/ext4/NTFS precision and catches mtime ties.
1687        //
        // V1-V5 payloads remain parseable by from_bytes for tests and tooling,
        // but read_from_disk rejects every on-disk version other than V6, so
        // older caches are rebuilt once after upgrade.
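        // Illustrative header bytes for a V6 index with dimension 384, two
        // entries, and no fingerprint (all fields little-endian):
        //   [06] [80 01 00 00] [02 00 00 00] [00 00 00 00]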
1691        let version = SEMANTIC_INDEX_VERSION_V6;
1692        buf.push(version);
1693        buf.extend_from_slice(&(self.dimension as u32).to_le_bytes());
1694        buf.extend_from_slice(&(self.entries.len() as u32).to_le_bytes());
1695        let fp_bytes_ref: &[u8] = fingerprint_bytes.as_deref().unwrap_or(&[]);
1696        buf.extend_from_slice(&(fp_bytes_ref.len() as u32).to_le_bytes());
1697        buf.extend_from_slice(fp_bytes_ref);
1698
        // File metadata table: count(4) + entries
        // V6 layout per entry: path_len(4) + relative path + secs(8) +
        // subsec_nanos(4) + size(8) + blake3 hash(32)
1701        buf.extend_from_slice(&(self.file_mtimes.len() as u32).to_le_bytes());
1702        for (path, mtime) in &self.file_mtimes {
1703            let relative = path
1704                .strip_prefix(&self.project_root)
1705                .unwrap_or(path.as_path());
1706            let path_bytes = relative.to_string_lossy().as_bytes().to_vec();
1707            buf.extend_from_slice(&(path_bytes.len() as u32).to_le_bytes());
1708            buf.extend_from_slice(&path_bytes);
1709            let duration = mtime
1710                .duration_since(SystemTime::UNIX_EPOCH)
1711                .unwrap_or_default();
1712            buf.extend_from_slice(&duration.as_secs().to_le_bytes());
1713            buf.extend_from_slice(&duration.subsec_nanos().to_le_bytes());
1714            let size = self.file_sizes.get(path).copied().unwrap_or_default();
1715            buf.extend_from_slice(&size.to_le_bytes());
1716            let hash = self
1717                .file_hashes
1718                .get(path)
1719                .copied()
1720                .unwrap_or_else(cache_freshness::zero_hash);
1721            buf.extend_from_slice(hash.as_bytes());
1722        }
1723
1724        // Entries: each is metadata + vector
1725        for entry in &self.entries {
1726            let c = &entry.chunk;
1727
1728            // File path
1729            let relative = c
1730                .file
1731                .strip_prefix(&self.project_root)
1732                .unwrap_or(c.file.as_path());
1733            let file_bytes = relative.to_string_lossy().as_bytes().to_vec();
1734            buf.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
1735            buf.extend_from_slice(&file_bytes);
1736
1737            // Name
1738            let name_bytes = c.name.as_bytes();
1739            buf.extend_from_slice(&(name_bytes.len() as u32).to_le_bytes());
1740            buf.extend_from_slice(name_bytes);
1741
1742            // Kind (1 byte)
1743            buf.push(symbol_kind_to_u8(&c.kind));
1744
1745            // Lines + exported
1746            buf.extend_from_slice(&(c.start_line as u32).to_le_bytes());
1747            buf.extend_from_slice(&(c.end_line as u32).to_le_bytes());
1748            buf.push(c.exported as u8);
1749
1750            // Snippet
1751            let snippet_bytes = c.snippet.as_bytes();
1752            buf.extend_from_slice(&(snippet_bytes.len() as u32).to_le_bytes());
1753            buf.extend_from_slice(snippet_bytes);
1754
1755            // Embed text
1756            let embed_bytes = c.embed_text.as_bytes();
1757            buf.extend_from_slice(&(embed_bytes.len() as u32).to_le_bytes());
1758            buf.extend_from_slice(embed_bytes);
1759
1760            // Vector (f32 array)
1761            for &val in &entry.vector {
1762                buf.extend_from_slice(&val.to_le_bytes());
1763            }
1764        }
1765
1766        buf
1767    }
1768
1769    /// Deserialize the index from bytes
1770    pub fn from_bytes(data: &[u8], current_canonical_root: &Path) -> Result<Self, String> {
1771        debug_assert!(current_canonical_root.is_absolute());
1772        let mut pos = 0;
1773
1774        if data.len() < HEADER_BYTES_V1 {
1775            return Err("data too short".to_string());
1776        }
1777
1778        let version = data[pos];
1779        pos += 1;
1780        if version != SEMANTIC_INDEX_VERSION_V1
1781            && version != SEMANTIC_INDEX_VERSION_V2
1782            && version != SEMANTIC_INDEX_VERSION_V3
1783            && version != SEMANTIC_INDEX_VERSION_V4
1784            && version != SEMANTIC_INDEX_VERSION_V5
1785            && version != SEMANTIC_INDEX_VERSION_V6
1786        {
1787            return Err(format!("unsupported version: {}", version));
1788        }
        // V2 and newer share the same header layout (V3 through V6 only differ
        // from V2 in the per-file metadata entry layout): version(1) +
        // dimension(4) + entry_count(4) + fingerprint_len(4) + fingerprint bytes.
1792        if (version == SEMANTIC_INDEX_VERSION_V2
1793            || version == SEMANTIC_INDEX_VERSION_V3
1794            || version == SEMANTIC_INDEX_VERSION_V4
1795            || version == SEMANTIC_INDEX_VERSION_V5
1796            || version == SEMANTIC_INDEX_VERSION_V6)
1797            && data.len() < HEADER_BYTES_V2
1798        {
1799            return Err("data too short for semantic index v2/v3/v4/v5/v6 header".to_string());
1800        }
1801
1802        let dimension = read_u32(data, &mut pos)? as usize;
1803        let entry_count = read_u32(data, &mut pos)? as usize;
1804        if dimension == 0 || dimension > MAX_DIMENSION {
1805            return Err(format!("invalid embedding dimension: {}", dimension));
1806        }
1807        if entry_count > MAX_ENTRIES {
1808            return Err(format!("too many semantic index entries: {}", entry_count));
1809        }
1810
1811        // Fingerprint handling:
1812        //   - V1: no fingerprint field at all.
1813        //   - V2: fingerprint_len + fingerprint bytes; always present (writer
1814        //     only emitted V2 when fingerprint was Some).
1815        //   - V3+: fingerprint_len always present; fingerprint_len==0 ⇒ None.
1816        let has_fingerprint_field = version == SEMANTIC_INDEX_VERSION_V2
1817            || version == SEMANTIC_INDEX_VERSION_V3
1818            || version == SEMANTIC_INDEX_VERSION_V4
1819            || version == SEMANTIC_INDEX_VERSION_V5
1820            || version == SEMANTIC_INDEX_VERSION_V6;
1821        let fingerprint = if has_fingerprint_field {
1822            let fingerprint_len = read_u32(data, &mut pos)? as usize;
1823            if pos + fingerprint_len > data.len() {
1824                return Err("unexpected end of data reading fingerprint".to_string());
1825            }
1826            if fingerprint_len == 0 {
1827                None
1828            } else {
1829                let raw = String::from_utf8_lossy(&data[pos..pos + fingerprint_len]).to_string();
1830                pos += fingerprint_len;
1831                Some(
1832                    serde_json::from_str::<SemanticIndexFingerprint>(&raw)
1833                        .map_err(|error| format!("invalid semantic fingerprint: {error}"))?,
1834                )
1835            }
1836        } else {
1837            None
1838        };
1839
1840        // File mtimes
1841        let mtime_count = read_u32(data, &mut pos)? as usize;
1842        if mtime_count > MAX_ENTRIES {
1843            return Err(format!("too many semantic file mtimes: {}", mtime_count));
1844        }
1845
1846        let vector_bytes = entry_count
1847            .checked_mul(dimension)
1848            .and_then(|count| count.checked_mul(F32_BYTES))
1849            .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
1850        if vector_bytes > data.len().saturating_sub(pos) {
1851            return Err("semantic index vectors exceed available data".to_string());
1852        }
1853
1854        let mut file_mtimes = HashMap::with_capacity(mtime_count);
1855        let mut file_sizes = HashMap::with_capacity(mtime_count);
1856        let mut file_hashes = HashMap::with_capacity(mtime_count);
1857        for _ in 0..mtime_count {
1858            let path = read_string(data, &mut pos)?;
1859            let secs = read_u64(data, &mut pos)?;
1860            // V3+ persists subsec_nanos alongside secs so staleness checks
1861            // survive restart round-trips. V1/V2 load with 0 nanos, which
1862            // causes one rebuild on upgrade (they never matched live APFS
1863            // mtimes anyway — the bug v0.15.2 fixes). After that rebuild,
1864            // the cache is persisted as V3 and stabilises.
1865            let nanos = if version == SEMANTIC_INDEX_VERSION_V3
1866                || version == SEMANTIC_INDEX_VERSION_V4
1867                || version == SEMANTIC_INDEX_VERSION_V5
1868                || version == SEMANTIC_INDEX_VERSION_V6
1869            {
1870                read_u32(data, &mut pos)?
1871            } else {
1872                0
1873            };
1874            let size =
1875                if version == SEMANTIC_INDEX_VERSION_V5 || version == SEMANTIC_INDEX_VERSION_V6 {
1876                    read_u64(data, &mut pos)?
1877                } else {
1878                    0
1879                };
1880            let content_hash = if version == SEMANTIC_INDEX_VERSION_V6 {
1881                if pos + 32 > data.len() {
1882                    return Err("unexpected end of data reading content hash".to_string());
1883                }
1884                let mut hash_bytes = [0u8; 32];
1885                hash_bytes.copy_from_slice(&data[pos..pos + 32]);
1886                pos += 32;
1887                blake3::Hash::from_bytes(hash_bytes)
1888            } else {
1889                cache_freshness::zero_hash()
1890            };
1891            // Hardening against corrupt / maliciously crafted cache files
1892            // (v0.15.2). `Duration::new(secs, nanos)` can panic when the
1893            // nanosecond carry overflows the second counter, and
1894            // `SystemTime + Duration` can panic on carry past the platform's
1895            // upper bound. Explicit validation keeps a corrupted semantic.bin
1896            // from taking down the whole aft process.
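            // For example, secs=u64::MAX with nanos=1_500_000_000 would make
            // the nanosecond carry overflow in Duration::new and panic without
            // the check below.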
1897            if nanos >= 1_000_000_000 {
1898                return Err(format!(
1899                    "invalid semantic mtime: nanos {} >= 1_000_000_000",
1900                    nanos
1901                ));
1902            }
1903            let duration = std::time::Duration::new(secs, nanos);
1904            let mtime = SystemTime::UNIX_EPOCH
1905                .checked_add(duration)
1906                .ok_or_else(|| {
1907                    format!(
1908                        "invalid semantic mtime: secs={} nanos={} overflows SystemTime",
1909                        secs, nanos
1910                    )
1911                })?;
1912            let path = if version == SEMANTIC_INDEX_VERSION_V6 {
1913                current_canonical_root.join(PathBuf::from(path))
1914            } else {
1915                PathBuf::from(path)
1916            };
1917            file_mtimes.insert(path.clone(), mtime);
1918            file_sizes.insert(path.clone(), size);
1919            file_hashes.insert(path, content_hash);
1920        }
1921
1922        // Entries
1923        let mut entries = Vec::with_capacity(entry_count);
1924        for _ in 0..entry_count {
1925            let raw_file = PathBuf::from(read_string(data, &mut pos)?);
1926            let file = if version == SEMANTIC_INDEX_VERSION_V6 {
1927                current_canonical_root.join(raw_file)
1928            } else {
1929                raw_file
1930            };
1931            let name = read_string(data, &mut pos)?;
1932
1933            if pos >= data.len() {
1934                return Err("unexpected end of data".to_string());
1935            }
1936            let kind = u8_to_symbol_kind(data[pos]);
1937            pos += 1;
1938
1939            let start_line = read_u32(data, &mut pos)?;
1940            let end_line = read_u32(data, &mut pos)?;
1941
1942            if pos >= data.len() {
1943                return Err("unexpected end of data".to_string());
1944            }
1945            let exported = data[pos] != 0;
1946            pos += 1;
1947
1948            let snippet = read_string(data, &mut pos)?;
1949            let embed_text = read_string(data, &mut pos)?;
1950
1951            // Vector
1952            let vec_bytes = dimension
1953                .checked_mul(F32_BYTES)
1954                .ok_or_else(|| "semantic vector allocation overflow".to_string())?;
1955            if pos + vec_bytes > data.len() {
1956                return Err("unexpected end of data reading vector".to_string());
1957            }
1958            let mut vector = Vec::with_capacity(dimension);
1959            for _ in 0..dimension {
1960                let bytes = [data[pos], data[pos + 1], data[pos + 2], data[pos + 3]];
1961                vector.push(f32::from_le_bytes(bytes));
1962                pos += 4;
1963            }
1964
1965            entries.push(EmbeddingEntry {
1966                chunk: SemanticChunk {
1967                    file,
1968                    name,
1969                    kind,
1970                    start_line,
1971                    end_line,
1972                    exported,
1973                    embed_text,
1974                    snippet,
1975                },
1976                vector,
1977            });
1978        }
1979
1980        if entries.len() != entry_count {
1981            return Err(format!(
1982                "semantic cache entry count drift: header={} decoded={}",
1983                entry_count,
1984                entries.len()
1985            ));
1986        }
1987        for entry in &entries {
1988            if !file_mtimes.contains_key(&entry.chunk.file) {
1989                return Err(format!(
1990                    "semantic cache metadata missing for entry file {}",
1991                    entry.chunk.file.display()
1992                ));
1993            }
1994        }
1995
1996        Ok(Self {
1997            entries,
1998            file_mtimes,
1999            file_sizes,
2000            file_hashes,
2001            dimension,
2002            fingerprint,
2003            project_root: current_canonical_root.to_path_buf(),
2004        })
2005    }
2006}
2007
2008/// Build enriched embedding text from a symbol with cAST-style context
2009fn build_embed_text(symbol: &Symbol, source: &str, file: &Path, project_root: &Path) -> String {
2010    let relative = file
2011        .strip_prefix(project_root)
2012        .unwrap_or(file)
2013        .to_string_lossy();
2014
2015    let kind_label = match &symbol.kind {
2016        SymbolKind::Function => "function",
2017        SymbolKind::Class => "class",
2018        SymbolKind::Method => "method",
2019        SymbolKind::Struct => "struct",
2020        SymbolKind::Interface => "interface",
2021        SymbolKind::Enum => "enum",
2022        SymbolKind::TypeAlias => "type",
2023        SymbolKind::Variable => "variable",
2024        SymbolKind::Heading => "heading",
2025        SymbolKind::FileSummary => "file-summary",
2026    };
2027
    // Builds: "name:validateAuth file:relative/path kind:function name:validateAuth
    // signature:fn validateAuth(token: &str) -> bool". The symbol name appears
    // twice (leading and after kind:), which gives it extra weight in the
    // embedding.
    let name = &symbol.name;
    let mut text = format!(
        "name:{name} file:{} kind:{} name:{name}",
        relative, kind_label
    );
2034
2035    if let Some(sig) = &symbol.signature {
2036        text.push_str(&format!(" signature:{}", sig));
2037    }
2038
2039    // Add body snippet (first ~300 chars of symbol body)
2040    let lines: Vec<&str> = source.lines().collect();
2041    let start = (symbol.range.start_line as usize).min(lines.len());
2042    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2043    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2044    if start < end {
2045        let body: String = lines[start..end]
2046            .iter()
2047            .take(15) // max 15 lines
2048            .copied()
2049            .collect::<Vec<&str>>()
2050            .join("\n");
2051        let snippet = if body.len() > 300 {
2052            format!("{}...", &body[..body.floor_char_boundary(300)])
2053        } else {
2054            body
2055        };
2056        text.push_str(&format!(" body:{}", snippet));
2057    }
2058
2059    text
2060}
2061
2062fn truncate_chars(value: &str, max_chars: usize) -> String {
2063    value.chars().take(max_chars).collect()
2064}
2065
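/// Extract the file's leading doc comment (a `/** ... */` block, or a run of
/// `///` / `//!` lines), truncated to 200 chars; returns an empty string when
/// the first non-blank line is anything else.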
2066fn first_leading_doc_comment(source: &str) -> String {
2067    let lines: Vec<&str> = source.lines().collect();
2068    let Some((start, first)) = lines
2069        .iter()
2070        .enumerate()
2071        .find(|(_, line)| !line.trim().is_empty())
2072    else {
2073        return String::new();
2074    };
2075
2076    let trimmed = first.trim_start();
2077    if trimmed.starts_with("/**") {
2078        let mut comment = Vec::new();
2079        for line in lines.iter().skip(start) {
2080            comment.push(*line);
2081            if line.contains("*/") {
2082                break;
2083            }
2084        }
2085        return truncate_chars(&comment.join("\n"), 200);
2086    }
2087
2088    if trimmed.starts_with("///") || trimmed.starts_with("//!") {
2089        let comment = lines
2090            .iter()
2091            .skip(start)
2092            .take_while(|line| {
2093                let trimmed = line.trim_start();
2094                trimmed.starts_with("///") || trimmed.starts_with("//!")
2095            })
2096            .copied()
2097            .collect::<Vec<_>>()
2098            .join("\n");
2099        return truncate_chars(&comment, 200);
2100    }
2101
2102    String::new()
2103}
2104
2105pub fn build_file_summary_chunk(
2106    file: &Path,
2107    project_root: &Path,
2108    source: &str,
2109    top_exports: &[&str],
2110    top_export_signatures: &[Option<&str>],
2111) -> SemanticChunk {
2112    let relative = file.strip_prefix(project_root).unwrap_or(file);
2113    let rel_path = relative.to_string_lossy();
2114    let parent_dir = relative
2115        .parent()
2116        .map(|parent| parent.to_string_lossy().to_string())
2117        .unwrap_or_default();
2118    let name = file
2119        .file_stem()
2120        .map(|stem| stem.to_string_lossy().to_string())
2121        .unwrap_or_default();
2122    let doc = first_leading_doc_comment(source);
2123    let exports = top_exports
2124        .iter()
2125        .take(5)
2126        .copied()
2127        .collect::<Vec<_>>()
2128        .join(",");
2129    let snippet = if doc.is_empty() {
2130        top_export_signatures
2131            .first()
2132            .and_then(|signature| signature.as_deref())
2133            .map(|signature| truncate_chars(signature, 200))
2134            .unwrap_or_default()
2135    } else {
2136        doc.clone()
2137    };
2138
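    // Illustrative embed text for a hypothetical src/auth/token.rs exporting
    // `sign` and `verify` with no leading doc comment:
    //   "file:src/auth/token.rs kind:file-summary name:token parent:src/auth doc: exports:sign,verify"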
    let embed_text = format!(
        "file:{rel_path} kind:file-summary name:{name} parent:{parent_dir} doc:{doc} exports:{exports}"
    );

    SemanticChunk {
        file: file.to_path_buf(),
        name,
        kind: SymbolKind::FileSummary,
        start_line: 0,
        end_line: 0,
        exported: false,
        embed_text,
        snippet,
    }
}
2155
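/// Lazily create and cache one tree-sitter `Parser` per language so a batch
/// touching many files pays grammar setup once per language, not per file.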
2156fn parser_for(
2157    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2158    lang: crate::parser::LangId,
2159) -> Result<&mut Parser, String> {
2160    use std::collections::hash_map::Entry;
2161
2162    match parsers.entry(lang) {
2163        Entry::Occupied(entry) => Ok(entry.into_mut()),
2164        Entry::Vacant(entry) => {
2165            let grammar = grammar_for(lang);
2166            let mut parser = Parser::new();
2167            parser
2168                .set_language(&grammar)
2169                .map_err(|error| error.to_string())?;
2170            Ok(entry.insert(parser))
2171        }
2172    }
2173}
2174
2175pub fn is_semantic_indexed_extension(path: &Path) -> bool {
2176    matches!(
2177        path.extension().and_then(|extension| extension.to_str()),
2178        Some(
2179            "ts" | "tsx"
2180                | "js"
2181                | "jsx"
2182                | "py"
2183                | "rs"
2184                | "go"
2185                | "c"
2186                | "h"
2187                | "cc"
2188                | "cpp"
2189                | "cxx"
2190                | "hpp"
2191                | "hh"
2192                | "zig"
2193                | "cs"
2194                | "sh"
2195                | "bash"
2196                | "zsh"
2197                | "sol"
2198                | "vue"
2199        )
2200    )
2201}
2202
2203fn collect_file_metadata(file: &Path) -> Result<IndexedFileMetadata, String> {
2204    let metadata = fs::metadata(file).map_err(|error| error.to_string())?;
2205    let mtime = metadata.modified().map_err(|error| error.to_string())?;
2206    let content_hash = cache_freshness::hash_file_if_small(file, metadata.len())
2207        .map_err(|error| error.to_string())?
2208        .unwrap_or_else(cache_freshness::zero_hash);
2209    Ok(IndexedFileMetadata {
2210        mtime,
2211        size: metadata.len(),
2212        content_hash,
2213    })
2214}
2215
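/// Parse one file with a cached tree-sitter parser and convert its symbols
/// into semantic chunks (including the file-summary chunk when eligible).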
2216fn collect_file_chunks(
2217    project_root: &Path,
2218    file: &Path,
2219    parsers: &mut HashMap<crate::parser::LangId, Parser>,
2220) -> Result<Vec<SemanticChunk>, String> {
2221    if !is_semantic_indexed_extension(file) {
2222        return Err("unsupported file extension".to_string());
2223    }
2224    let lang = detect_language(file).ok_or_else(|| "unsupported file extension".to_string())?;
2225    let source = std::fs::read_to_string(file).map_err(|error| error.to_string())?;
2226    let tree = parser_for(parsers, lang)?
2227        .parse(&source, None)
2228        .ok_or_else(|| format!("tree-sitter parse returned None for {}", file.display()))?;
2229    let symbols =
2230        extract_symbols_from_tree(&source, &tree, lang).map_err(|error| error.to_string())?;
2231
2232    Ok(symbols_to_chunks(file, &symbols, &source, project_root))
2233}
2234
2235/// Build a display snippet from a symbol's source
2236fn build_snippet(symbol: &Symbol, source: &str) -> String {
2237    let lines: Vec<&str> = source.lines().collect();
2238    let start = (symbol.range.start_line as usize).min(lines.len());
2239    // range.end_line is inclusive 0-based; +1 makes it an exclusive slice bound.
2240    let end = (symbol.range.end_line as usize + 1).min(lines.len());
2241    if start < end {
2242        let snippet_lines: Vec<&str> = lines[start..end].iter().take(5).copied().collect();
2243        let mut snippet = snippet_lines.join("\n");
2244        if end - start > 5 {
2245            snippet.push_str("\n  ...");
2246        }
2247        if snippet.len() > 300 {
2248            snippet = format!("{}...", &snippet[..snippet.floor_char_boundary(300)]);
2249        }
2250        snippet
2251    } else {
2252        String::new()
2253    }
2254}
2255
2256/// Convert symbols to semantic chunks with enriched context
2257fn symbols_to_chunks(
2258    file: &Path,
2259    symbols: &[Symbol],
2260    source: &str,
2261    project_root: &Path,
2262) -> Vec<SemanticChunk> {
2263    let mut chunks = Vec::new();
2264    let top_exports_with_signatures = symbols
2265        .iter()
2266        .filter(|symbol| {
2267            symbol.exported
2268                && symbol.parent.is_none()
2269                && !matches!(symbol.kind, SymbolKind::Heading)
2270        })
2271        .map(|symbol| (symbol.name.as_str(), symbol.signature.as_deref()))
2272        .collect::<Vec<_>>();
2273
2274    let has_only_headings = !symbols.is_empty()
2275        && symbols
2276            .iter()
2277            .all(|symbol| matches!(symbol.kind, SymbolKind::Heading));
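    // Only files with at most two top-level exports get a file-summary chunk:
    // larger files are already well represented by their per-symbol chunks.
    // Heading-only files (pure Markdown/HTML) get no summary either.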
2278    if top_exports_with_signatures.len() <= 2 && !has_only_headings {
2279        let top_exports = top_exports_with_signatures
2280            .iter()
2281            .map(|(name, _)| *name)
2282            .collect::<Vec<_>>();
2283        let top_export_signatures = top_exports_with_signatures
2284            .iter()
2285            .map(|(_, signature)| *signature)
2286            .collect::<Vec<_>>();
2287        chunks.push(build_file_summary_chunk(
2288            file,
2289            project_root,
2290            source,
2291            &top_exports,
2292            &top_export_signatures,
2293        ));
2294    }
2295
2296    for symbol in symbols {
2297        // Skip Markdown / HTML heading chunks: empirically they dominate result
2298        // lists even for code-shaped queries because heading prose embeds well.
2299        // Agents querying for code lose the actual matches under doc noise.
2300        // README/docs queries are still served by grep on the same files.
2301        if matches!(symbol.kind, SymbolKind::Heading) {
2302            continue;
2303        }
2304
        // Skip single-line symbols, except variables (e.g. exported consts),
        // which are worth indexing even on one line.
2306        let line_count = symbol
2307            .range
2308            .end_line
2309            .saturating_sub(symbol.range.start_line)
2310            + 1;
2311        if line_count < 2 && !matches!(symbol.kind, SymbolKind::Variable) {
2312            continue;
2313        }
2314
2315        let embed_text = build_embed_text(symbol, source, file, project_root);
2316        let snippet = build_snippet(symbol, source);
2317
2318        chunks.push(SemanticChunk {
2319            file: file.to_path_buf(),
2320            name: symbol.name.clone(),
2321            kind: symbol.kind.clone(),
2322            start_line: symbol.range.start_line,
2323            end_line: symbol.range.end_line,
2324            exported: symbol.exported,
2325            embed_text,
2326            snippet,
2327        });
2328
        // Note: nesting is not expanded here; the outline system handles
        // hierarchy, and each symbol is indexed individually.
2331    }
2332
2333    chunks
2334}
2335
2336/// Cosine similarity between two vectors
2337fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
2338    if a.len() != b.len() {
2339        return 0.0;
2340    }
2341
2342    let mut dot = 0.0f32;
2343    let mut norm_a = 0.0f32;
2344    let mut norm_b = 0.0f32;
2345
2346    for i in 0..a.len() {
2347        dot += a[i] * b[i];
2348        norm_a += a[i] * a[i];
2349        norm_b += b[i] * b[i];
2350    }
2351
2352    let denom = norm_a.sqrt() * norm_b.sqrt();
2353    if denom == 0.0 {
2354        0.0
2355    } else {
2356        dot / denom
2357    }
2358}
2359
2360// Serialization helpers
2361fn symbol_kind_to_u8(kind: &SymbolKind) -> u8 {
2362    match kind {
2363        SymbolKind::Function => 0,
2364        SymbolKind::Class => 1,
2365        SymbolKind::Method => 2,
2366        SymbolKind::Struct => 3,
2367        SymbolKind::Interface => 4,
2368        SymbolKind::Enum => 5,
2369        SymbolKind::TypeAlias => 6,
2370        SymbolKind::Variable => 7,
2371        SymbolKind::Heading => 8,
2372        SymbolKind::FileSummary => 9,
2373    }
2374}
2375
2376fn u8_to_symbol_kind(v: u8) -> SymbolKind {
2377    match v {
2378        0 => SymbolKind::Function,
2379        1 => SymbolKind::Class,
2380        2 => SymbolKind::Method,
2381        3 => SymbolKind::Struct,
2382        4 => SymbolKind::Interface,
2383        5 => SymbolKind::Enum,
2384        6 => SymbolKind::TypeAlias,
2385        7 => SymbolKind::Variable,
2386        8 => SymbolKind::Heading,
2387        9 => SymbolKind::FileSummary,
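        // Unknown kind bytes (e.g. written by a newer aft) decode to Heading
        // rather than failing the whole cache load.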
2388        _ => SymbolKind::Heading,
2389    }
2390}
2391
2392fn read_u32(data: &[u8], pos: &mut usize) -> Result<u32, String> {
2393    if *pos + 4 > data.len() {
2394        return Err("unexpected end of data reading u32".to_string());
2395    }
2396    let val = u32::from_le_bytes([data[*pos], data[*pos + 1], data[*pos + 2], data[*pos + 3]]);
2397    *pos += 4;
2398    Ok(val)
2399}
2400
2401fn read_u64(data: &[u8], pos: &mut usize) -> Result<u64, String> {
2402    if *pos + 8 > data.len() {
2403        return Err("unexpected end of data reading u64".to_string());
2404    }
2405    let bytes: [u8; 8] = data[*pos..*pos + 8].try_into().unwrap();
2406    *pos += 8;
2407    Ok(u64::from_le_bytes(bytes))
2408}
2409
2410fn read_string(data: &[u8], pos: &mut usize) -> Result<String, String> {
2411    let len = read_u32(data, pos)? as usize;
2412    if *pos + len > data.len() {
2413        return Err("unexpected end of data reading string".to_string());
2414    }
2415    let s = String::from_utf8_lossy(&data[*pos..*pos + len]).to_string();
2416    *pos += len;
2417    Ok(s)
2418}
2419
2420#[cfg(test)]
2421mod tests {
2422    use super::*;
2423    use crate::config::{SemanticBackend, SemanticBackendConfig};
2424    use crate::parser::FileParser;
2425    use std::io::{Read, Write};
2426    use std::net::TcpListener;
2427    use std::thread;
2428
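    /// One-shot HTTP server on an ephemeral local port for backend tests; the
    /// handler receives (request_line, path, body) and returns the JSON body
    /// of a 200 response.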
2429    fn start_mock_http_server<F>(handler: F) -> (String, thread::JoinHandle<()>)
2430    where
2431        F: Fn(String, String, String) -> String + Send + 'static,
2432    {
2433        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
2434        let addr = listener.local_addr().expect("local addr");
2435        let handle = thread::spawn(move || {
2436            let (mut stream, _) = listener.accept().expect("accept request");
2437            let mut buf = Vec::new();
2438            let mut chunk = [0u8; 4096];
2439            let mut header_end = None;
2440            let mut content_length = 0usize;
2441            loop {
2442                let n = stream.read(&mut chunk).expect("read request");
2443                if n == 0 {
2444                    break;
2445                }
2446                buf.extend_from_slice(&chunk[..n]);
2447                if header_end.is_none() {
2448                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
2449                        header_end = Some(pos + 4);
2450                        let headers = String::from_utf8_lossy(&buf[..pos + 4]);
2451                        for line in headers.lines() {
2452                            if let Some(value) = line.strip_prefix("Content-Length:") {
2453                                content_length = value.trim().parse::<usize>().unwrap_or(0);
2454                            }
2455                        }
2456                    }
2457                }
2458                if let Some(end) = header_end {
2459                    if buf.len() >= end + content_length {
2460                        break;
2461                    }
2462                }
2463            }
2464
2465            let end = header_end.expect("header terminator");
2466            let request = String::from_utf8_lossy(&buf[..end]).to_string();
2467            let body = String::from_utf8_lossy(&buf[end..end + content_length]).to_string();
2468            let mut lines = request.lines();
2469            let request_line = lines.next().expect("request line").to_string();
2470            let path = request_line
2471                .split_whitespace()
2472                .nth(1)
2473                .expect("request path")
2474                .to_string();
2475            let response_body = handler(request_line, path, body);
2476            let response = format!(
2477                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
2478                response_body.len(),
2479                response_body
2480            );
2481            stream
2482                .write_all(response.as_bytes())
2483                .expect("write response");
2484        });
2485
2486        (format!("http://{}", addr), handle)
2487    }
2488
2489    fn test_vector_for_texts(texts: Vec<String>) -> Result<Vec<Vec<f32>>, String> {
2490        Ok(texts.iter().map(|_| vec![1.0, 0.0, 0.0]).collect())
2491    }
2492
2493    fn write_rust_file(path: &Path, function_name: &str) {
2494        fs::write(
2495            path,
2496            format!("pub fn {function_name}() -> bool {{\n    true\n}}\n"),
2497        )
2498        .unwrap();
2499    }
2500
2501    fn build_test_index(project_root: &Path, files: &[PathBuf]) -> SemanticIndex {
2502        let mut embed = test_vector_for_texts;
2503        SemanticIndex::build(project_root, files, &mut embed, 8).unwrap()
2504    }
2505
2506    fn test_project_root() -> PathBuf {
2507        std::env::current_dir().unwrap()
2508    }
2509
2510    fn set_file_metadata(index: &mut SemanticIndex, file: &Path, mtime: SystemTime, size: u64) {
2511        index.file_mtimes.insert(file.to_path_buf(), mtime);
2512        index.file_sizes.insert(file.to_path_buf(), size);
2513        index
2514            .file_hashes
2515            .insert(file.to_path_buf(), cache_freshness::zero_hash());
2516    }
2517
2518    #[test]
2519    fn test_cosine_similarity_identical() {
2520        let a = vec![1.0, 0.0, 0.0];
2521        let b = vec![1.0, 0.0, 0.0];
2522        assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.001);
2523    }
2524
2525    #[test]
2526    fn test_cosine_similarity_orthogonal() {
2527        let a = vec![1.0, 0.0, 0.0];
2528        let b = vec![0.0, 1.0, 0.0];
2529        assert!(cosine_similarity(&a, &b).abs() < 0.001);
2530    }
2531
2532    #[test]
2533    fn test_cosine_similarity_opposite() {
2534        let a = vec![1.0, 0.0, 0.0];
2535        let b = vec![-1.0, 0.0, 0.0];
2536        assert!((cosine_similarity(&a, &b) + 1.0).abs() < 0.001);
2537    }
2538
2539    #[test]
2540    fn test_serialization_roundtrip() {
2541        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2542        index.entries.push(EmbeddingEntry {
2543            chunk: SemanticChunk {
2544                file: PathBuf::from("/src/main.rs"),
2545                name: "handle_request".to_string(),
2546                kind: SymbolKind::Function,
2547                start_line: 10,
2548                end_line: 25,
2549                exported: true,
2550                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
2551                snippet: "fn handle_request() {\n  // ...\n}".to_string(),
2552            },
2553            vector: vec![0.1, 0.2, 0.3, 0.4],
2554        });
2555        index.dimension = 4;
2556        index
2557            .file_mtimes
2558            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
2559        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
2560        index.set_fingerprint(SemanticIndexFingerprint {
2561            backend: "fastembed".to_string(),
2562            model: "all-MiniLM-L6-v2".to_string(),
2563            base_url: FALLBACK_BACKEND.to_string(),
2564            dimension: 4,
2565            chunking_version: default_chunking_version(),
2566        });
2567
2568        let bytes = index.to_bytes();
2569        let restored = SemanticIndex::from_bytes(&bytes, &test_project_root()).unwrap();
2570
2571        assert_eq!(restored.entries.len(), 1);
2572        assert_eq!(restored.entries[0].chunk.name, "handle_request");
2573        assert_eq!(restored.entries[0].vector, vec![0.1, 0.2, 0.3, 0.4]);
2574        assert_eq!(restored.dimension, 4);
2575        assert_eq!(restored.backend_label(), Some("fastembed"));
2576        assert_eq!(restored.model_label(), Some("all-MiniLM-L6-v2"));
2577    }
2578
2579    #[test]
2580    fn symbol_kind_serialization_roundtrip_includes_file_summary_variant() {
2581        let cases = [
2582            (SymbolKind::Function, 0),
2583            (SymbolKind::Class, 1),
2584            (SymbolKind::Method, 2),
2585            (SymbolKind::Struct, 3),
2586            (SymbolKind::Interface, 4),
2587            (SymbolKind::Enum, 5),
2588            (SymbolKind::TypeAlias, 6),
2589            (SymbolKind::Variable, 7),
2590            (SymbolKind::Heading, 8),
2591            (SymbolKind::FileSummary, 9),
2592        ];
2593
2594        for (kind, encoded) in cases {
2595            assert_eq!(symbol_kind_to_u8(&kind), encoded);
2596            assert_eq!(u8_to_symbol_kind(encoded), kind);
2597        }
2598    }
2599
2600    #[test]
2601    fn test_search_top_k() {
2602        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2603        index.dimension = 3;
2604
2605        // Add entries with known vectors
2606        for (i, name) in ["auth", "database", "handler"].iter().enumerate() {
2607            let mut vec = vec![0.0f32; 3];
2608            vec[i] = 1.0; // orthogonal vectors
2609            index.entries.push(EmbeddingEntry {
2610                chunk: SemanticChunk {
2611                    file: PathBuf::from("/src/lib.rs"),
2612                    name: name.to_string(),
2613                    kind: SymbolKind::Function,
2614                    start_line: (i * 10 + 1) as u32,
2615                    end_line: (i * 10 + 5) as u32,
2616                    exported: true,
2617                    embed_text: format!("kind:function name:{}", name),
2618                    snippet: format!("fn {}() {{}}", name),
2619                },
2620                vector: vec,
2621            });
2622        }
2623
2624        // Query aligned with "auth" (index 0)
2625        let query = vec![0.9, 0.1, 0.0];
2626        let results = index.search(&query, 2);
2627
2628        assert_eq!(results.len(), 2);
2629        assert_eq!(results[0].name, "auth"); // highest score
2630        assert!(results[0].score > results[1].score);
2631    }
2632
2633    #[test]
2634    fn test_empty_index_search() {
2635        let index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2636        let results = index.search(&[0.1, 0.2, 0.3], 10);
2637        assert!(results.is_empty());
2638    }
2639
2640    #[test]
2641    fn single_line_symbol_builds_non_empty_snippet() {
2642        let symbol = Symbol {
2643            name: "answer".to_string(),
2644            kind: SymbolKind::Variable,
2645            range: crate::symbols::Range {
2646                start_line: 0,
2647                start_col: 0,
2648                end_line: 0,
2649                end_col: 24,
2650            },
2651            signature: Some("const answer = 42".to_string()),
2652            scope_chain: Vec::new(),
2653            exported: true,
2654            parent: None,
2655        };
2656        let source = "export const answer = 42;\n";
2657
2658        let snippet = build_snippet(&symbol, source);
2659
2660        assert_eq!(snippet, "export const answer = 42;");
2661    }
2662
2663    #[test]
2664    fn optimized_file_chunk_collection_matches_file_parser_path() {
2665        let project_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
2666        let file = project_root.join("src/semantic_index.rs");
2667        let source = std::fs::read_to_string(&file).unwrap();
2668
2669        let mut legacy_parser = FileParser::new();
2670        let legacy_symbols = legacy_parser.extract_symbols(&file).unwrap();
2671        let legacy_chunks = symbols_to_chunks(&file, &legacy_symbols, &source, &project_root);
2672
2673        let mut parsers = HashMap::new();
2674        let optimized_chunks = collect_file_chunks(&project_root, &file, &mut parsers).unwrap();
2675
2676        assert_eq!(
2677            chunk_fingerprint(&optimized_chunks),
2678            chunk_fingerprint(&legacy_chunks)
2679        );
2680    }
2681
2682    fn chunk_fingerprint(
2683        chunks: &[SemanticChunk],
2684    ) -> Vec<(String, SymbolKind, u32, u32, bool, String, String)> {
2685        chunks
2686            .iter()
2687            .map(|chunk| {
2688                (
2689                    chunk.name.clone(),
2690                    chunk.kind.clone(),
2691                    chunk.start_line,
2692                    chunk.end_line,
2693                    chunk.exported,
2694                    chunk.embed_text.clone(),
2695                    chunk.snippet.clone(),
2696                )
2697            })
2698            .collect()
2699    }
2700
2701    #[test]
2702    fn rejects_oversized_dimension_during_deserialization() {
2703        let mut bytes = Vec::new();
2704        bytes.push(1u8);
2705        bytes.extend_from_slice(&((MAX_DIMENSION as u32) + 1).to_le_bytes());
2706        bytes.extend_from_slice(&0u32.to_le_bytes());
2707        bytes.extend_from_slice(&0u32.to_le_bytes());
2708
2709        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2710    }
2711
2712    #[test]
2713    fn rejects_oversized_entry_count_during_deserialization() {
2714        let mut bytes = Vec::new();
2715        bytes.push(1u8);
2716        bytes.extend_from_slice(&(DEFAULT_DIMENSION as u32).to_le_bytes());
2717        bytes.extend_from_slice(&((MAX_ENTRIES as u32) + 1).to_le_bytes());
2718        bytes.extend_from_slice(&0u32.to_le_bytes());
2719
2720        assert!(SemanticIndex::from_bytes(&bytes, &test_project_root()).is_err());
2721    }
2722
2723    #[test]
2724    fn invalidate_file_removes_entries_and_mtime() {
2725        let target = PathBuf::from("/src/main.rs");
2726        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2727        index.entries.push(EmbeddingEntry {
2728            chunk: SemanticChunk {
2729                file: target.clone(),
2730                name: "main".to_string(),
2731                kind: SymbolKind::Function,
2732                start_line: 0,
2733                end_line: 1,
2734                exported: false,
2735                embed_text: "main".to_string(),
2736                snippet: "fn main() {}".to_string(),
2737            },
2738            vector: vec![1.0; DEFAULT_DIMENSION],
2739        });
2740        index
2741            .file_mtimes
2742            .insert(target.clone(), SystemTime::UNIX_EPOCH);
2743        index.file_sizes.insert(target.clone(), 0);
2744
2745        index.invalidate_file(&target);
2746
2747        assert!(index.entries.is_empty());
2748        assert!(!index.file_mtimes.contains_key(&target));
2749        assert!(!index.file_sizes.contains_key(&target));
2750    }
2751
2752    #[test]
2753    fn refresh_transient_error_preserves_existing_entry_and_mtime() {
2754        let temp = tempfile::tempdir().unwrap();
2755        let project_root = temp.path();
2756        let file = project_root.join("src/lib.rs");
2757        fs::create_dir_all(file.parent().unwrap()).unwrap();
2758        write_rust_file(&file, "kept_symbol");
2759
2760        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2761        let original_entry_count = index.entries.len();
2762        let original_mtime = *index.file_mtimes.get(&file).unwrap();
2763        let original_size = *index.file_sizes.get(&file).unwrap();
2764
2765        let stale_mtime = SystemTime::UNIX_EPOCH;
2766        set_file_metadata(&mut index, &file, stale_mtime, original_size + 1);
2767        fs::remove_file(&file).unwrap();
2768
2769        let mut embed = test_vector_for_texts;
2770        let mut progress = |_done: usize, _total: usize| {};
2771        let summary = index
2772            .refresh_stale_files(
2773                project_root,
2774                std::slice::from_ref(&file),
2775                &mut embed,
2776                8,
2777                &mut progress,
2778            )
2779            .unwrap();
2780
2781        assert_eq!(summary.changed, 0);
2782        assert_eq!(summary.added, 0);
2783        assert_eq!(summary.deleted, 0);
2784        assert_eq!(index.entries.len(), original_entry_count);
2785        assert!(index
2786            .entries
2787            .iter()
2788            .any(|entry| entry.chunk.name == "kept_symbol"));
2789        assert_eq!(index.file_mtimes.get(&file), Some(&stale_mtime));
2790        assert_ne!(index.file_mtimes.get(&file), Some(&original_mtime));
2791        assert_eq!(index.file_sizes.get(&file), Some(&(original_size + 1)));
2792    }
2793
2794    #[test]
2795    fn refresh_never_indexed_file_error_does_not_record_mtime() {
2796        let temp = tempfile::tempdir().unwrap();
2797        let project_root = temp.path();
2798        let missing = project_root.join("src/missing.rs");
2799        fs::create_dir_all(missing.parent().unwrap()).unwrap();
2800
2801        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
2802        let mut embed = test_vector_for_texts;
2803        let mut progress = |_done: usize, _total: usize| {};
2804        let summary = index
2805            .refresh_stale_files(
2806                project_root,
2807                std::slice::from_ref(&missing),
2808                &mut embed,
2809                8,
2810                &mut progress,
2811            )
2812            .unwrap();
2813
2814        assert_eq!(summary.added, 0);
2815        assert_eq!(summary.changed, 0);
2816        assert_eq!(summary.deleted, 0);
2817        assert!(!index.file_mtimes.contains_key(&missing));
2818        assert!(!index.file_sizes.contains_key(&missing));
2819        assert!(index.entries.is_empty());
2820    }
2821
2822    #[test]
2823    fn refresh_reports_added_for_new_files() {
2824        let temp = tempfile::tempdir().unwrap();
2825        let project_root = temp.path();
2826        let existing = project_root.join("src/lib.rs");
2827        let added = project_root.join("src/new.rs");
2828        fs::create_dir_all(existing.parent().unwrap()).unwrap();
2829        write_rust_file(&existing, "existing_symbol");
2830        write_rust_file(&added, "added_symbol");
2831
2832        let mut index = build_test_index(project_root, std::slice::from_ref(&existing));
2833        let mut embed = test_vector_for_texts;
2834        let mut progress = |_done: usize, _total: usize| {};
2835        let summary = index
2836            .refresh_stale_files(
2837                project_root,
2838                &[existing.clone(), added.clone()],
2839                &mut embed,
2840                8,
2841                &mut progress,
2842            )
2843            .unwrap();
2844
2845        assert_eq!(summary.added, 1);
2846        assert_eq!(summary.changed, 0);
2847        assert_eq!(summary.deleted, 0);
2848        assert_eq!(summary.total_processed, 2);
2849        assert!(index.file_mtimes.contains_key(&added));
2850        assert!(index.entries.iter().any(|entry| entry.chunk.file == added));
2851    }
2852
2853    #[test]
2854    fn refresh_reports_deleted_for_removed_files() {
2855        let temp = tempfile::tempdir().unwrap();
2856        let project_root = temp.path();
2857        let deleted = project_root.join("src/deleted.rs");
2858        fs::create_dir_all(deleted.parent().unwrap()).unwrap();
2859        write_rust_file(&deleted, "deleted_symbol");
2860
2861        let mut index = build_test_index(project_root, std::slice::from_ref(&deleted));
2862        fs::remove_file(&deleted).unwrap();
2863
2864        let mut embed = test_vector_for_texts;
2865        let mut progress = |_done: usize, _total: usize| {};
2866        let summary = index
2867            .refresh_stale_files(project_root, &[], &mut embed, 8, &mut progress)
2868            .unwrap();
2869
2870        assert_eq!(summary.deleted, 1);
2871        assert_eq!(summary.changed, 0);
2872        assert_eq!(summary.added, 0);
2873        assert_eq!(summary.total_processed, 1);
2874        assert!(!index.file_mtimes.contains_key(&deleted));
2875        assert!(index.entries.is_empty());
2876    }
2877
2878    #[test]
2879    fn refresh_reports_changed_for_modified_files() {
2880        let temp = tempfile::tempdir().unwrap();
2881        let project_root = temp.path();
2882        let file = project_root.join("src/lib.rs");
2883        fs::create_dir_all(file.parent().unwrap()).unwrap();
2884        write_rust_file(&file, "old_symbol");
2885
2886        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
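        // Backdate the recorded mtime/size so the rewrite below registers
        // as a content change.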
2887        set_file_metadata(&mut index, &file, SystemTime::UNIX_EPOCH, 0);
2888        write_rust_file(&file, "new_symbol");
2889
2890        let mut embed = test_vector_for_texts;
2891        let mut progress = |_done: usize, _total: usize| {};
2892        let summary = index
2893            .refresh_stale_files(
2894                project_root,
2895                std::slice::from_ref(&file),
2896                &mut embed,
2897                8,
2898                &mut progress,
2899            )
2900            .unwrap();
2901
2902        assert_eq!(summary.changed, 1);
2903        assert_eq!(summary.added, 0);
2904        assert_eq!(summary.deleted, 0);
2905        assert_eq!(summary.total_processed, 1);
2906        assert!(index
2907            .entries
2908            .iter()
2909            .any(|entry| entry.chunk.name == "new_symbol"));
2910        assert!(!index
2911            .entries
2912            .iter()
2913            .any(|entry| entry.chunk.name == "old_symbol"));
2914    }
2915
2916    #[test]
2917    fn refresh_all_clean_reports_zero_counts_and_no_embedding_work() {
2918        let temp = tempfile::tempdir().unwrap();
2919        let project_root = temp.path();
2920        let file = project_root.join("src/lib.rs");
2921        fs::create_dir_all(file.parent().unwrap()).unwrap();
2922        write_rust_file(&file, "clean_symbol");
2923
2924        let mut index = build_test_index(project_root, std::slice::from_ref(&file));
2925        let original_entries = index.entries.len();
2926        let mut embed_called = false;
2927        let mut embed = |texts: Vec<String>| {
2928            embed_called = true;
2929            test_vector_for_texts(texts)
2930        };
2931        let mut progress = |_done: usize, _total: usize| {};
2932        let summary = index
2933            .refresh_stale_files(
2934                project_root,
2935                std::slice::from_ref(&file),
2936                &mut embed,
2937                8,
2938                &mut progress,
2939            )
2940            .unwrap();
2941
2942        assert!(summary.is_noop());
2943        assert_eq!(summary.total_processed, 1);
2944        assert!(!embed_called);
2945        assert_eq!(index.entries.len(), original_entries);
2946    }
2947
2948    #[test]
2949    fn detects_missing_onnx_runtime_from_dynamic_load_error() {
2950        let message = "Failed to load ONNX Runtime shared library libonnxruntime.dylib via dlopen: no such file";
2951
2952        assert!(is_onnx_runtime_unavailable(message));
2953    }
2954
2955    #[test]
2956    fn formats_missing_onnx_runtime_with_install_hint() {
2957        let message = format_embedding_init_error(
2958            "Failed to load ONNX Runtime shared library libonnxruntime.so via dlopen: no such file",
2959        );
2960
2961        assert!(message.starts_with("ONNX Runtime not found. Install via:"));
2962        assert!(message.contains("Original error:"));
2963    }
2964
2965    #[test]
2966    fn openai_compatible_backend_embeds_with_mock_server() {
2967        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
2968            assert!(request_line.starts_with("POST "));
2969            assert_eq!(path, "/v1/embeddings");
2970            r#"{"data":[{"embedding":[0.1,0.2,0.3],"index":0},{"embedding":[0.4,0.5,0.6],"index":1}]}"#.to_string()
2971        });
2972
2973        let config = SemanticBackendConfig {
2974            backend: SemanticBackend::OpenAiCompatible,
2975            model: "test-embedding".to_string(),
2976            base_url: Some(base_url),
2977            api_key_env: None,
2978            timeout_ms: 5_000,
2979            max_batch_size: 64,
2980        };
2981
2982        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
2983        let vectors = model
2984            .embed(vec!["hello".to_string(), "world".to_string()])
2985            .unwrap();
2986
2987        assert_eq!(vectors, vec![vec![0.1, 0.2, 0.3], vec![0.4, 0.5, 0.6]]);
2988        handle.join().unwrap();
2989    }
2990
2991    /// Regression for issue #36: AFT was sending TWO Content-Type headers
2992    /// on the OpenAI embeddings request — once implicitly via `.json(&body)`
2993    /// and again explicitly via `.header("Content-Type", "application/json")`.
2994    /// reqwest's `.header()` calls `HeaderMap::append`, which produces two
2995    /// headers on the wire. OpenAI's /v1/embeddings endpoint rejects that
2996    /// with `HTTP 400 "you must provide a model parameter"` even though the
2997    /// body actually contains `model`. The fix is to drop the explicit
2998    /// `.header("Content-Type", ...)` call. This test pins that we send
2999    /// exactly one Content-Type header.
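    ///
    /// A minimal sketch of the buggy vs. fixed builder chain (illustrative
    /// only; the real request construction lives in the backend code):
    ///
    /// ```ignore
    /// // Buggy: .json() sets Content-Type, then .header() appends a second copy.
    /// let req = client.post(url).json(&body).header("Content-Type", "application/json");
    /// // Fixed: let .json() own the header.
    /// let req = client.post(url).json(&body);
    /// ```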
3000    #[test]
3001    fn openai_compatible_request_has_single_content_type_header() {
3002        use std::sync::{Arc, Mutex};
3003        let captured: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::new()));
3004        let captured_for_thread = Arc::clone(&captured);
3005
3006        let listener = TcpListener::bind("127.0.0.1:0").expect("bind test server");
3007        let addr = listener.local_addr().expect("local addr");
3008        let handle = thread::spawn(move || {
3009            let (mut stream, _) = listener.accept().expect("accept");
3010            let mut buf = Vec::new();
3011            let mut chunk = [0u8; 4096];
3012            let mut header_end = None;
3013            let mut content_length = 0usize;
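            // Accumulate reads until both the header block and the
            // Content-Length-declared body have arrived; a single read()
            // may return only part of the request.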
3014            loop {
3015                let n = stream.read(&mut chunk).expect("read");
3016                if n == 0 {
3017                    break;
3018                }
3019                buf.extend_from_slice(&chunk[..n]);
3020                if header_end.is_none() {
3021                    if let Some(pos) = buf.windows(4).position(|window| window == b"\r\n\r\n") {
3022                        header_end = Some(pos + 4);
3023                        for line in String::from_utf8_lossy(&buf[..pos + 4]).lines() {
3024                            if let Some(value) = line.strip_prefix("Content-Length:") {
3025                                content_length = value.trim().parse::<usize>().unwrap_or(0);
3026                            }
3027                        }
3028                    }
3029                }
3030                if let Some(end) = header_end {
3031                    if buf.len() >= end + content_length {
3032                        break;
3033                    }
3034                }
3035            }
3036            *captured_for_thread.lock().unwrap() = buf;
3037            let body = r#"{"data":[{"embedding":[0.1,0.2,0.3],"index":0}]}"#;
3038            let response = format!(
3039                "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{}",
3040                body.len(),
3041                body
3042            );
3043            let _ = stream.write_all(response.as_bytes());
3044        });
3045
3046        let config = SemanticBackendConfig {
3047            backend: SemanticBackend::OpenAiCompatible,
3048            model: "text-embedding-3-small".to_string(),
3049            base_url: Some(format!("http://{}", addr)),
3050            api_key_env: None,
3051            timeout_ms: 5_000,
3052            max_batch_size: 64,
3053        };
3054        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3055        let _ = model.embed(vec!["probe".to_string()]).unwrap();
3056        handle.join().unwrap();
3057
3058        let bytes = captured.lock().unwrap().clone();
3059        let request = String::from_utf8_lossy(&bytes);
3060
3061        // Compare case-insensitively: HTTP header names are case-insensitive
3062        // on the wire, and reqwest may emit `content-type` in lowercase.
3063        let content_type_lines = request
3064            .lines()
3065            .filter(|line| {
3066                let lower = line.to_ascii_lowercase();
3067                lower.starts_with("content-type:")
3068            })
3069            .count();
3070        assert_eq!(
3071            content_type_lines, 1,
3072            "expected exactly one Content-Type header but found {content_type_lines}; full request:\n{request}",
3073        );
3074
3075        // The body must still include the model field — pin this so a future
3076        // change can't accidentally drop `model` while fixing duplicate headers.
3077        assert!(
3078            request.contains(r#""model":"text-embedding-3-small""#),
3079            "request body should contain model field; full request:\n{request}",
3080        );
3081    }
3082
3083    #[test]
3084    fn ollama_backend_embeds_with_mock_server() {
3085        let (base_url, handle) = start_mock_http_server(|request_line, path, _body| {
3086            assert!(request_line.starts_with("POST "));
3087            assert_eq!(path, "/api/embed");
3088            r#"{"embeddings":[[0.7,0.8,0.9],[1.0,1.1,1.2]]}"#.to_string()
3089        });
3090
3091        let config = SemanticBackendConfig {
3092            backend: SemanticBackend::Ollama,
3093            model: "embeddinggemma".to_string(),
3094            base_url: Some(base_url),
3095            api_key_env: None,
3096            timeout_ms: 5_000,
3097            max_batch_size: 64,
3098        };
3099
3100        let mut model = SemanticEmbeddingModel::from_config(&config).unwrap();
3101        let vectors = model
3102            .embed(vec!["hello".to_string(), "world".to_string()])
3103            .unwrap();
3104
3105        assert_eq!(vectors, vec![vec![0.7, 0.8, 0.9], vec![1.0, 1.1, 1.2]]);
3106        handle.join().unwrap();
3107    }
3108
3109    #[test]
3110    fn read_from_disk_rejects_fingerprint_mismatch() {
3111        let storage = tempfile::tempdir().unwrap();
3112        let project_key = "proj";
3113
3114        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3115        index.entries.push(EmbeddingEntry {
3116            chunk: SemanticChunk {
3117                file: PathBuf::from("/src/main.rs"),
3118                name: "handle_request".to_string(),
3119                kind: SymbolKind::Function,
3120                start_line: 10,
3121                end_line: 25,
3122                exported: true,
3123                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3124                snippet: "fn handle_request() {}".to_string(),
3125            },
3126            vector: vec![0.1, 0.2, 0.3],
3127        });
3128        index.dimension = 3;
3129        index
3130            .file_mtimes
3131            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
3132        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
3133        index.set_fingerprint(SemanticIndexFingerprint {
3134            backend: "openai_compatible".to_string(),
3135            model: "test-embedding".to_string(),
3136            base_url: "http://127.0.0.1:1234/v1".to_string(),
3137            dimension: 3,
3138            chunking_version: default_chunking_version(),
3139        });
3140        index.write_to_disk(storage.path(), project_key);
3141
3142        let matching = index.fingerprint().unwrap().as_string();
3143        assert!(SemanticIndex::read_from_disk(
3144            storage.path(),
3145            project_key,
3146            &test_project_root(),
3147            false,
3148            Some(&matching),
3149        )
3150        .is_some());
3151
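        // Same dimension, different backend/model/base_url: the cached
        // vectors must be rejected rather than reused across models.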
3152        let mismatched = SemanticIndexFingerprint {
3153            backend: "ollama".to_string(),
3154            model: "embeddinggemma".to_string(),
3155            base_url: "http://127.0.0.1:11434".to_string(),
3156            dimension: 3,
3157            chunking_version: default_chunking_version(),
3158        }
3159        .as_string();
3160        assert!(SemanticIndex::read_from_disk(
3161            storage.path(),
3162            project_key,
3163            &test_project_root(),
3164            false,
3165            Some(&mismatched),
3166        )
3167        .is_none());
3168    }
3169
3170    #[test]
3171    fn read_from_disk_rejects_v3_cache_for_snippet_rebuild() {
3172        let storage = tempfile::tempdir().unwrap();
3173        let project_key = "proj-v3";
3174        let dir = storage.path().join("semantic").join(project_key);
3175        fs::create_dir_all(&dir).unwrap();
3176
3177        let mut index = SemanticIndex::new(test_project_root(), DEFAULT_DIMENSION);
3178        index.entries.push(EmbeddingEntry {
3179            chunk: SemanticChunk {
3180                file: PathBuf::from("/src/main.rs"),
3181                name: "handle_request".to_string(),
3182                kind: SymbolKind::Function,
3183                start_line: 0,
3184                end_line: 0,
3185                exported: true,
3186                embed_text: "file:src/main.rs kind:function name:handle_request".to_string(),
3187                snippet: "fn handle_request() {}".to_string(),
3188            },
3189            vector: vec![0.1, 0.2, 0.3],
3190        });
3191        index.dimension = 3;
3192        index
3193            .file_mtimes
3194            .insert(PathBuf::from("/src/main.rs"), SystemTime::UNIX_EPOCH);
3195        index.file_sizes.insert(PathBuf::from("/src/main.rs"), 0);
3196        let fingerprint = SemanticIndexFingerprint {
3197            backend: "fastembed".to_string(),
3198            model: "test".to_string(),
3199            base_url: FALLBACK_BACKEND.to_string(),
3200            dimension: 3,
3201            chunking_version: default_chunking_version(),
3202        };
3203        index.set_fingerprint(fingerprint.clone());
3204
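        // Patch the version byte down to V3 to simulate a cache written
        // before the snippet-range fix; the loader must discard the file.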
3205        let mut bytes = index.to_bytes();
3206        bytes[0] = SEMANTIC_INDEX_VERSION_V3;
3207        fs::write(dir.join("semantic.bin"), bytes).unwrap();
3208
3209        assert!(SemanticIndex::read_from_disk(
3210            storage.path(),
3211            project_key,
3212            &test_project_root(),
3213            false,
3214            Some(&fingerprint.as_string())
3215        )
3216        .is_none());
3217        assert!(!dir.join("semantic.bin").exists());
3218    }
3219
3220    fn make_symbol(kind: SymbolKind, name: &str, start: u32, end: u32) -> crate::symbols::Symbol {
3221        crate::symbols::Symbol {
3222            name: name.to_string(),
3223            kind,
3224            range: crate::symbols::Range {
3225                start_line: start,
3226                start_col: 0,
3227                end_line: end,
3228                end_col: 0,
3229            },
3230            signature: None,
3231            scope_chain: Vec::new(),
3232            exported: false,
3233            parent: None,
3234        }
3235    }
3236
3237    /// Heading symbols (Markdown / HTML headings) must NOT be indexed —
3238    /// they overwhelmingly dominated semantic results even on code-shaped
3239    /// queries because heading prose embeds far more strongly than code
3240    /// chunks. Skipping headings keeps aft_search a code-finder.
3241    #[test]
3242    fn symbols_to_chunks_skips_heading_symbols() {
3243        let project_root = PathBuf::from("/proj");
3244        let file = project_root.join("README.md");
3245        let source = "# Title\n\nbody text\n\n## Section\n\nmore text\n";
3246
3247        let symbols = vec![
3248            make_symbol(SymbolKind::Heading, "Title", 0, 2),
3249            make_symbol(SymbolKind::Heading, "Section", 4, 6),
3250        ];
3251
3252        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3253        assert!(
3254            chunks.is_empty(),
3255            "Heading symbols must be filtered out before embedding; got {} chunk(s)",
3256            chunks.len()
3257        );
3258    }
3259
3260    /// Code symbols (functions, classes, methods, structs, etc.) must still
3261    /// be indexed alongside the heading skip — otherwise we'd starve the
3262    /// index entirely.
3263    #[test]
3264    fn symbols_to_chunks_keeps_code_symbols_alongside_skipped_headings() {
3265        let project_root = PathBuf::from("/proj");
3266        let file = project_root.join("src/lib.rs");
3267        let source = "pub fn handle_request() -> bool {\n    true\n}\n";
3268
3269        let symbols = vec![
3270            // A heading mixed in (e.g. from a doc comment block elsewhere).
3271            make_symbol(SymbolKind::Heading, "doc heading", 0, 1),
3272            make_symbol(SymbolKind::Function, "handle_request", 0, 2),
3273            make_symbol(SymbolKind::Struct, "AuthService", 4, 6),
3274        ];
3275
3276        let chunks = symbols_to_chunks(&file, &symbols, source, &project_root);
3277        assert_eq!(
3278            chunks.len(),
3279            3,
3280            "Expected file-summary + 2 code chunks (Function + Struct), got {}",
3281            chunks.len()
3282        );
3283        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
3284        assert!(chunks
3285            .iter()
3286            .any(|chunk| matches!(chunk.kind, SymbolKind::FileSummary)));
3287        assert!(names.contains(&"handle_request"));
3288        assert!(names.contains(&"AuthService"));
3289        assert!(
3290            !names.contains(&"doc heading"),
3291            "Heading symbol leaked into chunks: {names:?}"
3292        );
3293    }
3294
3295    #[test]
3296    fn validate_ssrf_allows_loopback_hostnames() {
3297        // Loopback hostnames are explicitly allowed so self-hosted backends
3298        // (Ollama at http://localhost:11434) work with their default config.
3299        for host in &[
3300            "http://localhost",
3301            "http://localhost:8080",
3302            "http://localhost:11434", // Ollama default
3303            "http://localhost.localdomain",
3304            "http://foo.localhost",
3305        ] {
3306            assert!(
3307                validate_base_url_no_ssrf(host).is_ok(),
3308                "Expected {host} to be allowed (loopback), got: {:?}",
3309                validate_base_url_no_ssrf(host)
3310            );
3311        }
3312    }
3313
3314    #[test]
3315    fn validate_ssrf_allows_loopback_ips() {
3316        // 127.0.0.0/8 is loopback — by definition same-machine and not an
3317        // SSRF target. Allow it so Ollama at http://127.0.0.1:11434 works.
3318        for url in &[
3319            "http://127.0.0.1",
3320            "http://127.0.0.1:11434", // Ollama default
3321            "http://127.0.0.1:8080",
3322            "http://127.1.2.3",
3323        ] {
3324            let result = validate_base_url_no_ssrf(url);
3325            assert!(
3326                result.is_ok(),
3327                "Expected {url} to be allowed (loopback), got: {:?}",
3328                result
3329            );
3330        }
3331    }
3332
3333    #[test]
3334    fn validate_ssrf_rejects_private_non_loopback_ips() {
3335        // Non-loopback private/reserved IPs remain rejected — homelab/intranet
3336        // services on LAN IPs are real SSRF targets even though the user
3337        // configured them. Users who want this can opt in by binding the
3338        // service to a publicly routable address.
3339        for url in &[
3340            "http://192.168.1.1",
3341            "http://10.0.0.1",
3342            "http://172.16.0.1",
3343            "http://169.254.169.254",
3344            "http://100.64.0.1",
3345        ] {
3346            let result = validate_base_url_no_ssrf(url);
3347            assert!(
3348                result.is_err(),
3349                "Expected {url} to be rejected (non-loopback private), got: {:?}",
3350                result
3351            );
3352        }
3353    }
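
    // A minimal sketch of the private-range decision these SSRF tests pin
    // down (illustrative only; the real logic lives in
    // validate_base_url_no_ssrf, and `private_ip_allowed` is a hypothetical
    // name):
    //
    //     use std::net::Ipv4Addr;
    //
    //     // For a non-public IPv4 address, only loopback is acceptable.
    //     fn private_ip_allowed(ip: Ipv4Addr) -> bool {
    //         ip.is_loopback()
    //     }
    //
    //     assert!(private_ip_allowed(Ipv4Addr::new(127, 0, 0, 1)));
    //     assert!(!private_ip_allowed(Ipv4Addr::new(192, 168, 1, 1)));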
3354
3355    #[test]
3356    fn validate_ssrf_rejects_mdns_local_hostnames() {
3357        // mDNS .local hostnames typically resolve to LAN devices, not
3358        // loopback. Rejecting them before DNS lookup gives a clearer error.
3359        for host in &[
3360            "http://printer.local",
3361            "http://nas.local:8080",
3362            "http://homelab.local",
3363        ] {
3364            let result = validate_base_url_no_ssrf(host);
3365            assert!(
3366                result.is_err(),
3367                "Expected {host} to be rejected (mDNS), got: {:?}",
3368                result
3369            );
3370        }
3371    }
3372
3373    #[test]
3374    fn normalize_base_url_allows_localhost_for_tests() {
3375        // normalize_base_url itself should NOT block localhost — only
3376        // validate_base_url_no_ssrf does. Tests construct backends directly.
3377        assert!(normalize_base_url("http://127.0.0.1:9999").is_ok());
3378        assert!(normalize_base_url("http://localhost:8080").is_ok());
3379    }
3380
3381    /// Pin the user-facing wording of the ONNX version-mismatch error.
3382    /// The auto-fix path MUST be listed first because it's the only safe
3383    /// option that doesn't require sudo or risk breaking other apps that
3384    /// link the system library. Regression of any of these strings would
3385    /// either mislead users (system rm before auto-fix) or break the
3386    /// `aft doctor --fix` discovery path.
3387    #[test]
3388    fn ort_mismatch_message_recommends_auto_fix_first() {
3389        let msg =
3390            format_ort_version_mismatch("1.9.0", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so");
3391
3392        // The reported version and path must appear verbatim.
3393        assert!(
3394            msg.contains("v1.9.0"),
3395            "should report detected version: {msg}"
3396        );
3397        assert!(
3398            msg.contains("/usr/lib/x86_64-linux-gnu/libonnxruntime.so"),
3399            "should report system path: {msg}"
3400        );
3401        assert!(msg.contains("v1.20+"), "should state requirement: {msg}");
3402
3403        // Solution ordering: auto-fix is #1, system rm is #2, install is #3.
3404        let auto_fix_pos = msg
3405            .find("Auto-fix")
3406            .expect("Auto-fix solution missing — users won't discover --fix");
3407        let remove_pos = msg
3408            .find("Remove the old library")
3409            .expect("system-rm solution missing");
3410        assert!(
3411            auto_fix_pos < remove_pos,
3412            "Auto-fix must come before manual rm — see PR comment thread"
3413        );
3414
3415        // The auto-fix command must be runnable as-is on a fresh system.
3416        assert!(
3417            msg.contains("npx @cortexkit/aft doctor --fix"),
3418            "auto-fix command must be present and copy-pasteable: {msg}"
3419        );
3420    }
3421
3422    /// macOS dylib paths must not produce a malformed message when the
3423    /// system path lacks a trailing slash. This is a regression guard
3424    /// for the "{}\n{}" format string contract.
3425    #[test]
3426    fn ort_mismatch_message_handles_macos_dylib_path() {
3427        let msg = format_ort_version_mismatch("1.9.0", "/opt/homebrew/lib/libonnxruntime.dylib");
3428        assert!(msg.contains("v1.9.0"));
3429        assert!(msg.contains("/opt/homebrew/lib/libonnxruntime.dylib"));
3430        // The dylib path must appear in the auto-fix paragraph (single
3431        // quotes around it) AND in the manual-rm paragraph; verify
3432        // both placements survived the format string.
3433        assert!(
3434            msg.contains("'/opt/homebrew/lib/libonnxruntime.dylib'"),
3435            "system path should be quoted in the auto-fix sentence: {msg}"
3436        );
3437    }
3438}