gitcortex_mcp/
embeddings.rs1use std::collections::HashMap;
15use std::io::{BufWriter, Write};
16use std::path::{Path, PathBuf};
17
18use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
19use gitcortex_core::graph::Node;
20
/// Lowest dot-product score a stored vector may reach against the normalised
/// query and still be returned by `SemanticIndex::top_k`.
const SIMILARITY_THRESHOLD: f32 = 0.5;
/// Vector dimension that `save_bin` records in the index file header.
// NOTE(review): presumably the output size of the AllMiniLM-L6-v2 model used
// by `Embedder` — confirm against the model documentation.
const DIM: usize = 384;

/// Four-byte tag written at the very start of an index file; `load_bin`
/// refuses files that do not begin with it.
const MAGIC: &[u8; 4] = b"GCXV";
/// Format version written into the index file header right after `MAGIC`.
const FORMAT_VERSION: u32 = 1;
28
29pub struct SemanticIndex {
32 vectors: HashMap<String, Vec<f32>>,
34 path: PathBuf,
35}
36
37impl SemanticIndex {
38 pub fn load_or_create(path: &Path) -> Self {
39 let vectors = load_bin(path).unwrap_or_default();
40 if !vectors.is_empty() {
41 tracing::info!(
42 "semantic index loaded: {} vectors from {}",
43 vectors.len(),
44 path.display()
45 );
46 }
47 Self {
48 vectors,
49 path: path.to_owned(),
50 }
51 }
52
53 pub fn has(&self, node_id: &str) -> bool {
54 self.vectors.contains_key(node_id)
55 }
56
57 pub fn insert(&mut self, node_id: String, vec: Vec<f32>) {
58 self.vectors.insert(node_id, unit_normalise(vec));
59 }
60
61 pub fn len(&self) -> usize {
62 self.vectors.len()
63 }
64
65 pub fn is_empty(&self) -> bool {
66 self.vectors.is_empty()
67 }
68
69 pub fn retain_ids(&mut self, live_ids: &std::collections::HashSet<String>) -> usize {
74 let before = self.vectors.len();
75 self.vectors.retain(|id, _| live_ids.contains(id));
76 before - self.vectors.len()
77 }
78
79 pub fn save(&self) {
80 if let Err(e) = save_bin(&self.path, &self.vectors) {
81 tracing::warn!("failed to save semantic index: {e}");
82 }
83 }
84
85 pub fn top_k(&self, query_vec: &[f32], k: usize) -> Vec<String> {
88 let q = unit_normalise(query_vec.to_vec());
89 let mut scores: Vec<(&String, f32)> = self
90 .vectors
91 .iter()
92 .map(|(id, v)| (id, dot(&q, v)))
93 .filter(|(_, s)| *s >= SIMILARITY_THRESHOLD)
94 .collect();
95 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
96 scores
97 .into_iter()
98 .take(k)
99 .map(|(id, _)| id.clone())
100 .collect()
101 }
102}
103
104pub struct Embedder {
107 model: TextEmbedding,
108}
109
110impl Embedder {
111 pub fn new() -> anyhow::Result<Self> {
113 tracing::info!("initialising semantic embedder (AllMiniLM-L6-v2) …");
114 let model = TextEmbedding::try_new(
115 InitOptions::new(EmbeddingModel::AllMiniLML6V2).with_show_download_progress(false),
116 )?;
117 tracing::info!("semantic embedder ready");
118 Ok(Self { model })
119 }
120
121 pub fn embed_one(&self, text: &str) -> anyhow::Result<Vec<f32>> {
122 let mut out = self.model.embed(vec![text.to_owned()], None)?;
123 out.pop()
124 .ok_or_else(|| anyhow::anyhow!("embedder returned no vectors"))
125 }
126
127 pub fn embed_batch(&self, texts: Vec<String>) -> anyhow::Result<Vec<Vec<f32>>> {
129 self.model.embed(texts, None)
130 }
131}
132
133pub fn node_text(n: &Node) -> String {
138 let kind = n.kind.to_string();
139 let sig = &n.metadata.definition.signature;
140 let doc = n.metadata.definition.doc_comment.as_deref().unwrap_or("");
141 if sig.is_empty() && doc.is_empty() {
142 format!("{kind} {}", n.qualified_name)
143 } else if doc.is_empty() {
144 format!("{kind} {} {sig}", n.qualified_name)
145 } else {
146 format!("{kind} {} {sig} {doc}", n.qualified_name)
147 }
148}
149
/// Dot product of `a` and `b` over the positions both slices share; surplus
/// elements of the longer slice do not contribute.
fn dot(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(&lhs, &rhs)| lhs * rhs).sum()
}
155
/// Scales `v` to unit Euclidean length and returns it.
///
/// A vector whose length is not greater than `f32::EPSILON` is returned
/// unchanged, which avoids dividing by (nearly) zero.
fn unit_normalise(mut v: Vec<f32>) -> Vec<f32> {
    let length = v.iter().map(|c| c * c).sum::<f32>().sqrt();
    if length > f32::EPSILON {
        v.iter_mut().for_each(|c| *c /= length);
    }
    v
}
165
166fn load_bin(path: &Path) -> Option<HashMap<String, Vec<f32>>> {
179 let data = std::fs::read(path).ok()?;
180 let mut p = 0usize;
181
182 macro_rules! read_u32 {
183 () => {{
184 let b: [u8; 4] = data.get(p..p + 4)?.try_into().ok()?;
185 p += 4;
186 u32::from_le_bytes(b)
187 }};
188 }
189
190 if data.get(p..p + 4)? != MAGIC {
191 return None;
192 }
193 p += 4;
194
195 let _ver = read_u32!();
196 let dim = read_u32!() as usize;
197 let count = read_u32!() as usize;
198
199 let mut map = HashMap::with_capacity(count);
200 for _ in 0..count {
201 let id_len = read_u32!() as usize;
202 let id = String::from_utf8(data.get(p..p + id_len)?.to_vec()).ok()?;
203 p += id_len;
204 let end = p + dim * 4;
205 let vec: Vec<f32> = data
206 .get(p..end)?
207 .chunks_exact(4)
208 .map(|b| f32::from_le_bytes(b.try_into().unwrap()))
209 .collect();
210 p = end;
211 map.insert(id, vec);
212 }
213 Some(map)
214}
215
216fn save_bin(path: &Path, vectors: &HashMap<String, Vec<f32>>) -> std::io::Result<()> {
217 if let Some(parent) = path.parent() {
218 std::fs::create_dir_all(parent)?;
219 }
220 let tmp = path.with_extension("tmp");
221 {
222 let f = std::fs::File::create(&tmp)?;
223 let mut w = BufWriter::new(f);
224 w.write_all(MAGIC)?;
225 w.write_all(&FORMAT_VERSION.to_le_bytes())?;
226 w.write_all(&(DIM as u32).to_le_bytes())?;
227 w.write_all(&(vectors.len() as u32).to_le_bytes())?;
228 for (id, vec) in vectors {
229 let id_b = id.as_bytes();
230 w.write_all(&(id_b.len() as u32).to_le_bytes())?;
231 w.write_all(id_b)?;
232 for &v in vec {
233 w.write_all(&v.to_le_bytes())?;
234 }
235 }
236 w.flush()?;
237 }
238 std::fs::rename(&tmp, path)?;
239 Ok(())
240}