avocado_core/
approx.rs

1use crate::{Result, ScoredSpan, Span};
2use crate::index::cosine_similarity;
3use serde::{Serialize, Deserialize};
4use std::path::Path;
5
/// Minimal ANN (approximate nearest neighbour) abstraction for build/search/save/load.
///
/// Implementations own their spans: `build` takes them by value and `spans`
/// lends them back out as a slice.
pub trait ApproxIndex: Sized {
    /// Build from fully-embedded spans (embedding must be present).
    ///
    /// Takes ownership of `spans`; the constructor is infallible by signature.
    fn build(spans: Vec<Span>) -> Self;

    /// Search the top-`k` results for the given query embedding.
    ///
    /// Returns the matches as `ScoredSpan`s (a span paired with its score).
    fn search(&self, query_embedding: &[f32], k: usize) -> Result<Vec<ScoredSpan>>;

    /// Save the owned index to disk under the directory `dir`.
    fn save_to_disk(&self, dir: &Path) -> Result<()>;

    /// Load an owned index from the directory `dir`.
    ///
    /// `Ok(None)` allows an implementation to report "nothing usable on disk"
    /// without raising an error; `InstantBackend` in this file does so when
    /// the index file is missing or carries a different format version.
    fn load_from_disk(dir: &Path) -> Result<Option<Self>>;

    /// Return the spans referenced by this index.
    fn spans(&self) -> &[Span];
}
23
/// HNSW adapter delegating to the existing `VectorIndex`.
///
/// A newtype whose `ApproxIndex` implementation forwards every call to the
/// wrapped index; the inner value is public so callers can still reach it.
pub struct HnswBackend(pub crate::index::VectorIndex);
26
27impl ApproxIndex for HnswBackend {
28    fn build(spans: Vec<Span>) -> Self {
29        Self(crate::index::VectorIndex::build(spans))
30    }
31
32    fn search(&self, query_embedding: &[f32], k: usize) -> Result<Vec<ScoredSpan>> {
33        self.0.search(query_embedding, k)
34    }
35
36    fn save_to_disk(&self, dir: &Path) -> Result<()> {
37        self.0.save_to_disk(dir)
38    }
39
40    fn load_from_disk(dir: &Path) -> Result<Option<Self>> {
41        Ok(crate::index::VectorIndex::load_from_disk(dir)?.map(HnswBackend))
42    }
43
44    fn spans(&self) -> &[Span] {
45        self.0.spans()
46    }
47}
48
/// Instant backend (spike): owned vectors + brute-force search (placeholder).
///
/// Note: this is a spike scaffold to validate owned save/load and determinism,
/// not a performance implementation. It can be swapped for a real
/// instant-distance index while keeping the `ApproxIndex` API intact.
#[derive(Serialize, Deserialize)]
pub struct InstantBackend {
    /// Embedding length the index was built with; `search` returns no results
    /// when this is 0 or when the query has a different length.
    dimension: usize,
    /// The indexed spans, as handed to `build` or read back from disk.
    spans: Vec<Span>,
    /// One vector per entry of `spans`, at the same position; empty when the
    /// span had no embedding at build time.
    embeddings: Vec<Vec<f32>>,
}
60
/// Persisted form of `InstantBackend`, written and read with bincode by
/// `save_to_disk` / `load_from_disk`.
///
/// NOTE(review): bincode encodes struct fields positionally, so adding,
/// removing or reordering fields presumably changes the file format and
/// should come with a `version` bump — confirm against the bincode docs.
#[derive(Serialize, Deserialize)]
struct InstantBackendOnDisk {
    /// Format version; `load_from_disk` only accepts the value 1.
    version: u32,
    /// Copy of `InstantBackend::dimension`.
    dimension: usize,
    /// Spans reduced to their persistable fields.
    spans: Vec<SpanLite>,
    /// Copy of `InstantBackend::embeddings`.
    embeddings: Vec<Vec<f32>>,
}
68
/// Reduced copy of `Span` used inside the on-disk index.
///
/// Carries every `Span` field that the conversions in this file copy; the
/// span's `embedding` and `metadata` are not part of it and come back as
/// `None` when a `SpanLite` is converted into a `Span`.
#[derive(Serialize, Deserialize, Clone)]
struct SpanLite {
    /// Copied from `Span::id`.
    id: String,
    /// Copied from `Span::artifact_id`.
    artifact_id: String,
    /// Copied from `Span::start_line`.
    start_line: usize,
    /// Copied from `Span::end_line`.
    end_line: usize,
    /// Copied from `Span::text`.
    text: String,
    /// Copied from `Span::token_count`.
    token_count: usize,
    /// Copied from `Span::embedding_model`.
    embedding_model: Option<String>,
}
79
80impl From<&Span> for SpanLite {
81    fn from(s: &Span) -> Self {
82        SpanLite {
83            id: s.id.clone(),
84            artifact_id: s.artifact_id.clone(),
85            start_line: s.start_line,
86            end_line: s.end_line,
87            text: s.text.clone(),
88            token_count: s.token_count,
89            embedding_model: s.embedding_model.clone(),
90        }
91    }
92}
93
94impl From<SpanLite> for Span {
95    fn from(s: SpanLite) -> Self {
96        Span {
97            id: s.id,
98            artifact_id: s.artifact_id,
99            start_line: s.start_line,
100            end_line: s.end_line,
101            text: s.text,
102            embedding: None, // set separately from embeddings vec
103            embedding_model: s.embedding_model,
104            token_count: s.token_count,
105            metadata: None,
106        }
107    }
108}
109
110impl ApproxIndex for InstantBackend {
111    fn build(spans: Vec<Span>) -> Self {
112        // Collect embeddings; assume all spans have embeddings with same dimension
113        let embeddings: Vec<Vec<f32>> = spans
114            .iter()
115            .map(|s| s.embedding.clone().unwrap_or_default())
116            .collect();
117        let dimension = embeddings.first().map(|e| e.len()).unwrap_or(0);
118        Self {
119            dimension,
120            spans,
121            embeddings,
122        }
123    }
124
125    fn search(&self, query_embedding: &[f32], k: usize) -> Result<Vec<ScoredSpan>> {
126        if query_embedding.len() != self.dimension || self.dimension == 0 {
127            return Ok(Vec::new());
128        }
129        // Brute force cosine similarity (deterministic)
130        let mut scored: Vec<ScoredSpan> = self
131            .spans
132            .iter()
133            .zip(self.embeddings.iter())
134            .map(|(s, emb)| {
135                let score = if emb.len() == self.dimension {
136                    cosine_similarity(query_embedding, emb)
137                } else {
138                    0.0
139                };
140                ScoredSpan {
141                    span: s.clone(),
142                    score,
143                }
144            })
145            .collect();
146        // Sort by score desc, then (artifact_id, start_line) for determinism
147        scored.sort_by(|a, b| {
148            b.score
149                .partial_cmp(&a.score)
150                .unwrap_or(std::cmp::Ordering::Equal)
151                .then_with(|| a.span.artifact_id.cmp(&b.span.artifact_id))
152                .then_with(|| a.span.start_line.cmp(&b.span.start_line))
153        });
154        scored.truncate(k.min(scored.len()));
155        Ok(scored)
156    }
157
158    fn save_to_disk(&self, dir: &Path) -> Result<()> {
159        use std::fs;
160        fs::create_dir_all(dir)?;
161        // Store a reduced on-disk format that avoids serde_json dependency
162        let slim: Vec<SpanLite> = self.spans.iter().map(SpanLite::from).collect();
163        let on_disk = InstantBackendOnDisk {
164            version: 1,
165            dimension: self.dimension,
166            spans: slim,
167            embeddings: self.embeddings.clone(),
168        };
169        let data = bincode::serialize(&on_disk)
170            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("serialize instant index: {}", e)))?;
171        let tmp = dir.join("instant.idx.tmp");
172        let dst = dir.join("instant.idx");
173        fs::write(&tmp, data)?;
174        fs::rename(tmp, dst)?;
175        Ok(())
176    }
177
178    fn load_from_disk(dir: &Path) -> Result<Option<Self>> {
179        use std::fs;
180        let path = dir.join("instant.idx");
181        if !path.exists() {
182            return Ok(None);
183        }
184        let bytes = fs::read(path)?;
185        let on_disk: InstantBackendOnDisk = bincode::deserialize(&bytes)
186            .map_err(|e| crate::types::Error::Other(anyhow::anyhow!("deserialize instant index: {}", e)))?;
187        if on_disk.version != 1 {
188            return Ok(None);
189        }
190        // Reconstruct spans without embeddings; embeddings stored separately
191        let spans: Vec<Span> = on_disk.spans.into_iter().map(Span::from).collect();
192        Ok(Some(InstantBackend {
193            dimension: on_disk.dimension,
194            spans,
195            embeddings: on_disk.embeddings,
196        }))
197    }
198
199    fn spans(&self) -> &[Span] {
200        &self.spans
201    }
202}
203