avocado_core/
approx.rs

1//! Approximate Nearest Neighbor (ANN) index abstraction
2//!
3//! Provides a minimal trait for building and searching vector indexes.
4
5use crate::{Result, ScoredSpan, Span};
6use crate::index::cosine_similarity;
7use serde::{Serialize, Deserialize};
8use std::path::Path;
9
/// Minimal ANN abstraction for build/search/save/load
///
/// Each backend in this module implements this trait so callers can build an
/// index from spans, query it with an embedding, and persist it to a
/// directory through one uniform API. The `Sized` bound is required because
/// `build` and `load_from_disk` return `Self` by value.
pub trait ApproxIndex: Sized {
    /// Build from fully-embedded spans (embedding must be present)
    ///
    /// Takes ownership of `spans`; the resulting index exposes them again
    /// through [`ApproxIndex::spans`].
    fn build(spans: Vec<Span>) -> Self;

    /// Search top-k results for given query embedding
    ///
    /// `query_embedding` is the query vector and `k` the maximum number of
    /// results requested. Ranking and tie-breaking are backend-specific.
    fn search(&self, query_embedding: &[f32], k: usize) -> Result<Vec<ScoredSpan>>;

    /// Save owned index to disk
    ///
    /// `dir` is the directory the backend writes its index file(s) into.
    fn save_to_disk(&self, dir: &Path) -> Result<()>;

    /// Load owned index from disk
    ///
    /// Returns `Ok(None)` when the backend finds no usable index in `dir`;
    /// the exact conditions are backend-specific (e.g. `InstantBackend`
    /// returns `None` for a missing file or an unsupported format version).
    fn load_from_disk(dir: &Path) -> Result<Option<Self>>;

    /// Return referenced spans
    fn spans(&self) -> &[Span];
}
27
/// HNSW adapter delegating to existing `VectorIndex`
///
/// A newtype around `crate::index::VectorIndex`; the wrapped index is a
/// public field, so callers can still reach it directly via `.0`.
pub struct HnswBackend(pub crate::index::VectorIndex);
30
31impl ApproxIndex for HnswBackend {
32    fn build(spans: Vec<Span>) -> Self {
33        Self(crate::index::VectorIndex::build(spans))
34    }
35
36    fn search(&self, query_embedding: &[f32], k: usize) -> Result<Vec<ScoredSpan>> {
37        self.0.search(query_embedding, k)
38    }
39
40    fn save_to_disk(&self, dir: &Path) -> Result<()> {
41        self.0.save_to_disk(dir)
42    }
43
44    fn load_from_disk(dir: &Path) -> Result<Option<Self>> {
45        Ok(crate::index::VectorIndex::load_from_disk(dir)?.map(HnswBackend))
46    }
47
48    fn spans(&self) -> &[Span] {
49        self.0.spans()
50    }
51}
52
/// Instant backend (spike): owned vectors + brute-force search (placeholder)
///
/// Note: This is a spike scaffold to validate owned save/load and determinism,
/// not a performance implementation. It can be swapped with a real
/// instant-distance index while keeping the ApproxIndex API intact.
///
/// `spans` and `embeddings` are parallel vectors: `search` pairs them up by
/// position. Persistence goes through `InstantBackendOnDisk` rather than this
/// struct's own serde derive.
#[derive(Serialize, Deserialize)]
pub struct InstantBackend {
    // Embedding length the index was built with; `search` returns no results
    // for queries of any other length, or when this is 0.
    dimension: usize,
    // Spans in insertion order, returned by `spans()`.
    spans: Vec<Span>,
    // One vector per span, same order as `spans`; empty for a span that had
    // no embedding at build time.
    embeddings: Vec<Vec<f32>>,
}
64
/// Serialized layout of an `InstantBackend`, written to `instant.idx` with
/// bincode.
///
/// Field order is part of the on-disk format (bincode encodes struct fields
/// positionally), so do not reorder fields without bumping `version`.
#[derive(Serialize, Deserialize)]
struct InstantBackendOnDisk {
    // Format version; the loader only accepts 1.
    version: u32,
    // Copy of `InstantBackend::dimension`.
    dimension: usize,
    // Spans reduced to their persistable fields.
    spans: Vec<SpanLite>,
    // Copy of `InstantBackend::embeddings`, parallel to `spans`.
    embeddings: Vec<Vec<f32>>,
}
72
/// Persistable subset of `Span` used inside `InstantBackendOnDisk`.
///
/// Carries every `Span` field except `embedding` (stored in the separate
/// embeddings vector) and `metadata` (not persisted at all). Field order is
/// part of the bincode on-disk format.
#[derive(Serialize, Deserialize, Clone)]
struct SpanLite {
    id: String,
    artifact_id: String,
    start_line: usize,
    end_line: usize,
    text: String,
    token_count: usize,
    embedding_model: Option<String>,
}
83
84impl From<&Span> for SpanLite {
85    fn from(s: &Span) -> Self {
86        SpanLite {
87            id: s.id.clone(),
88            artifact_id: s.artifact_id.clone(),
89            start_line: s.start_line,
90            end_line: s.end_line,
91            text: s.text.clone(),
92            token_count: s.token_count,
93            embedding_model: s.embedding_model.clone(),
94        }
95    }
96}
97
98impl From<SpanLite> for Span {
99    fn from(s: SpanLite) -> Self {
100        Span {
101            id: s.id,
102            artifact_id: s.artifact_id,
103            start_line: s.start_line,
104            end_line: s.end_line,
105            text: s.text,
106            embedding: None, // set separately from embeddings vec
107            embedding_model: s.embedding_model,
108            token_count: s.token_count,
109            metadata: None,
110        }
111    }
112}
113
114impl ApproxIndex for InstantBackend {
115    fn build(spans: Vec<Span>) -> Self {
116        // Collect embeddings; assume all spans have embeddings with same dimension
117        let embeddings: Vec<Vec<f32>> = spans
118            .iter()
119            .map(|s| s.embedding.clone().unwrap_or_default())
120            .collect();
121        let dimension = embeddings.first().map(|e| e.len()).unwrap_or(0);
122        Self {
123            dimension,
124            spans,
125            embeddings,
126        }
127    }
128
129    fn search(&self, query_embedding: &[f32], k: usize) -> Result<Vec<ScoredSpan>> {
130        if query_embedding.len() != self.dimension || self.dimension == 0 {
131            return Ok(Vec::new());
132        }
133        // Brute force cosine similarity (deterministic)
134        let mut scored: Vec<ScoredSpan> = self
135            .spans
136            .iter()
137            .zip(self.embeddings.iter())
138            .map(|(s, emb)| {
139                let score = if emb.len() == self.dimension {
140                    cosine_similarity(query_embedding, emb)
141                } else {
142                    0.0
143                };
144                ScoredSpan {
145                    span: s.clone(),
146                    score,
147                }
148            })
149            .collect();
150        // Sort by score desc, then (artifact_id, start_line) for determinism
151        scored.sort_by(|a, b| {
152            b.score
153                .partial_cmp(&a.score)
154                .unwrap_or(std::cmp::Ordering::Equal)
155                .then_with(|| a.span.artifact_id.cmp(&b.span.artifact_id))
156                .then_with(|| a.span.start_line.cmp(&b.span.start_line))
157        });
158        scored.truncate(k.min(scored.len()));
159        Ok(scored)
160    }
161
162    fn save_to_disk(&self, dir: &Path) -> Result<()> {
163        use std::fs;
164        fs::create_dir_all(dir)?;
165        // Store a reduced on-disk format that avoids serde_json dependency
166        let slim: Vec<SpanLite> = self.spans.iter().map(SpanLite::from).collect();
167        let on_disk = InstantBackendOnDisk {
168            version: 1,
169            dimension: self.dimension,
170            spans: slim,
171            embeddings: self.embeddings.clone(),
172        };
173        let data = bincode::serialize(&on_disk)
174            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("serialize instant index: {}", e)))?;
175        let tmp = dir.join("instant.idx.tmp");
176        let dst = dir.join("instant.idx");
177        fs::write(&tmp, data)?;
178        fs::rename(tmp, dst)?;
179        Ok(())
180    }
181
182    fn load_from_disk(dir: &Path) -> Result<Option<Self>> {
183        use std::fs;
184        let path = dir.join("instant.idx");
185        if !path.exists() {
186            return Ok(None);
187        }
188        let bytes = fs::read(path)?;
189        let on_disk: InstantBackendOnDisk = bincode::deserialize(&bytes)
190            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("deserialize instant index: {}", e)))?;
191        if on_disk.version != 1 {
192            return Ok(None);
193        }
194        // Reconstruct spans without embeddings; embeddings stored separately
195        let spans: Vec<Span> = on_disk.spans.into_iter().map(Span::from).collect();
196        Ok(Some(InstantBackend {
197            dimension: on_disk.dimension,
198            spans,
199            embeddings: on_disk.embeddings,
200        }))
201    }
202
203    fn spans(&self) -> &[Span] {
204        &self.spans
205    }
206}
207