seekr-code 1.0.0

A semantic code search engine, smarter than grep. Supports text regex + semantic vector + AST pattern search, 100% local.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
//! Core index storage.
//!
//! Manages a vector index (HNSW + fallback brute-force KNN) for semantic search
//! and an inverted text index for keyword search. Provides build, query, save,
//! and load operations.

use std::collections::HashMap;
use std::path::Path;

use crate::error::IndexError;
use crate::index::{IndexEntry, SearchHit};
use crate::parser::CodeChunk;
use crate::INDEX_VERSION;

// ============================================================
// HNSW Point wrapper
// ============================================================

/// Newtype over `Vec<f32>` that implements `instant_distance::Point` so
/// embeddings can be stored in the HNSW graph.
///
/// The metric is cosine distance (`1 - cosine_similarity`), which is the
/// right choice for L2-normalized embedding vectors.
#[derive(Clone, Debug)]
struct EmbeddingPoint(Vec<f32>);

impl instant_distance::Point for EmbeddingPoint {
    fn distance(&self, other: &Self) -> f32 {
        // For unit-length vectors the dot product IS the cosine similarity;
        // subtracting from 1 flips it into a distance (smaller = closer).
        let similarity: f32 = self
            .0
            .iter()
            .zip(other.0.iter())
            .map(|(x, y)| x * y)
            .sum();
        1.0 - similarity
    }
}

/// The main index structure holding both vector and text indices.
///
/// Uses HNSW (Hierarchical Navigable Small Worlds) for fast approximate
/// nearest neighbor search, with brute-force KNN as fallback.
///
/// NOTE: this struct is persisted with bincode (see `save`/`load`), and
/// bincode encodes serde struct fields in declaration order — do not
/// reorder or remove fields without bumping `INDEX_VERSION`.
#[derive(serde::Serialize, serde::Deserialize)]
pub struct SeekrIndex {
    /// Index format version for compatibility checks.
    pub version: u32,

    /// Vector index: chunk_id -> embedding vector.
    pub vectors: HashMap<u64, Vec<f32>>,

    /// Inverted text index: token -> list of (chunk_id, frequency).
    pub inverted_index: HashMap<String, Vec<(u64, u32)>>,

    /// Metadata: chunk_id -> stored chunk data.
    pub chunks: HashMap<u64, CodeChunk>,

    /// Embedding dimension.
    pub embedding_dim: usize,

    /// Total number of indexed chunks. Kept in sync with `chunks.len()`
    /// by every mutator; stored redundantly so it survives serialization.
    pub chunk_count: usize,

    /// HNSW index for fast approximate nearest neighbor search.
    /// Built in-memory from the vectors HashMap; not serialized to disk.
    /// `None` until `rebuild_hnsw` runs (e.g. right after deserialization).
    #[serde(skip)]
    hnsw: Option<instant_distance::HnswMap<EmbeddingPoint, u64>>,
}

impl std::fmt::Debug for SeekrIndex {
    // Manual Debug impl: the bulk maps and the HNSW graph are summarized
    // (lengths / presence) instead of dumped in full.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let hnsw_state = self.hnsw.as_ref().map(|_| "Some(<HnswMap>)");
        let mut dbg = f.debug_struct("SeekrIndex");
        dbg.field("version", &self.version);
        dbg.field("embedding_dim", &self.embedding_dim);
        dbg.field("chunk_count", &self.chunk_count);
        dbg.field("vectors_len", &self.vectors.len());
        dbg.field("hnsw", &hnsw_state);
        dbg.finish()
    }
}

impl SeekrIndex {
    /// Create a new empty index.
    pub fn new(embedding_dim: usize) -> Self {
        Self {
            version: INDEX_VERSION,
            vectors: HashMap::new(),
            inverted_index: HashMap::new(),
            chunks: HashMap::new(),
            embedding_dim,
            chunk_count: 0,
            hnsw: None,
        }
    }

    /// Add an entry to the index.
    pub fn add_entry(&mut self, entry: IndexEntry, chunk: CodeChunk) {
        let chunk_id = entry.chunk_id;

        // Add to vector index
        self.vectors.insert(chunk_id, entry.embedding);

        // Add to inverted text index
        for token in &entry.text_tokens {
            let posting_list = self.inverted_index.entry(token.clone()).or_default();
            if let Some(existing) = posting_list.iter_mut().find(|(id, _)| *id == chunk_id) {
                existing.1 += 1;
            } else {
                posting_list.push((chunk_id, 1));
            }
        }

        // Store chunk metadata
        self.chunks.insert(chunk_id, chunk);
        self.chunk_count = self.chunks.len();
    }

    /// Remove a chunk from the index by ID.
    ///
    /// Removes the chunk from vectors, inverted index, and metadata.
    pub fn remove_chunk(&mut self, chunk_id: u64) {
        // Remove from vector index
        self.vectors.remove(&chunk_id);

        // Remove from inverted text index
        self.inverted_index.retain(|_token, posting_list| {
            posting_list.retain(|(id, _)| *id != chunk_id);
            !posting_list.is_empty()
        });

        // Remove from chunk metadata
        self.chunks.remove(&chunk_id);
        self.chunk_count = self.chunks.len();
    }

    /// Remove multiple chunks by their IDs.
    pub fn remove_chunks(&mut self, chunk_ids: &[u64]) {
        for &chunk_id in chunk_ids {
            self.remove_chunk(chunk_id);
        }
    }

    /// Build the index from chunks and their embeddings.
    ///
    /// Also builds the HNSW graph for fast approximate nearest neighbor search.
    pub fn build_from(
        chunks: &[CodeChunk],
        embeddings: &[Vec<f32>],
        embedding_dim: usize,
    ) -> Self {
        let mut index = Self::new(embedding_dim);

        for (chunk, embedding) in chunks.iter().zip(embeddings.iter()) {
            let text_tokens = tokenize_for_index(&chunk.body);

            let entry = IndexEntry {
                chunk_id: chunk.id,
                embedding: embedding.clone(),
                text_tokens,
            };

            index.add_entry(entry, chunk.clone());
        }

        // Build HNSW graph from all vectors
        index.rebuild_hnsw();

        index
    }

    /// Rebuild the HNSW graph from the current vectors HashMap.
    ///
    /// Called after build_from(), load(), or after incremental updates.
    pub fn rebuild_hnsw(&mut self) {
        if self.vectors.is_empty() {
            self.hnsw = None;
            return;
        }

        let mut points = Vec::with_capacity(self.vectors.len());
        let mut values = Vec::with_capacity(self.vectors.len());

        for (&chunk_id, embedding) in &self.vectors {
            points.push(EmbeddingPoint(embedding.clone()));
            values.push(chunk_id);
        }

        let hnsw_map = instant_distance::Builder::default().build(points, values);
        self.hnsw = Some(hnsw_map);

        tracing::debug!(
            chunks = self.vectors.len(),
            "HNSW graph built"
        );
    }

    /// Perform a vector similarity search.
    ///
    /// Uses HNSW approximate nearest neighbor search when available (O(log n)),
    /// falling back to brute-force KNN (O(n*d)) for older indexes or when
    /// HNSW is not built.
    ///
    /// Returns the top-k most similar chunks by cosine similarity.
    pub fn search_vector(
        &self,
        query_embedding: &[f32],
        top_k: usize,
        score_threshold: f32,
    ) -> Vec<SearchHit> {
        if let Some(ref hnsw) = self.hnsw {
            // Fast path: HNSW approximate nearest neighbor search
            self.search_vector_hnsw(hnsw, query_embedding, top_k, score_threshold)
        } else {
            // Fallback: brute-force KNN (for backward compatibility or small indexes)
            self.search_vector_brute_force(query_embedding, top_k, score_threshold)
        }
    }

    /// HNSW-based vector search (O(log n) per query).
    fn search_vector_hnsw(
        &self,
        hnsw: &instant_distance::HnswMap<EmbeddingPoint, u64>,
        query_embedding: &[f32],
        top_k: usize,
        score_threshold: f32,
    ) -> Vec<SearchHit> {
        let query_point = EmbeddingPoint(query_embedding.to_vec());
        let mut search = instant_distance::Search::default();

        let results: Vec<SearchHit> = hnsw
            .search(&query_point, &mut search)
            .take(top_k)
            .filter_map(|item| {
                let chunk_id = *item.value;
                // Convert distance back to similarity: similarity = 1 - distance
                let score = 1.0 - item.distance;
                if score >= score_threshold {
                    Some(SearchHit { chunk_id, score })
                } else {
                    None
                }
            })
            .collect();

        results
    }

    /// Brute-force KNN vector search (O(n*d) per query).
    ///
    /// Used as fallback when HNSW index is not available.
    fn search_vector_brute_force(
        &self,
        query_embedding: &[f32],
        top_k: usize,
        score_threshold: f32,
    ) -> Vec<SearchHit> {
        let mut scores: Vec<(u64, f32)> = self
            .vectors
            .iter()
            .map(|(&chunk_id, embedding)| {
                let score = cosine_similarity(query_embedding, embedding);
                (chunk_id, score)
            })
            .filter(|(_, score)| *score >= score_threshold)
            .collect();

        // Sort by score descending
        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

        scores
            .into_iter()
            .take(top_k)
            .map(|(chunk_id, score)| SearchHit { chunk_id, score })
            .collect()
    }

    /// Perform a text search using the inverted index.
    ///
    /// Returns chunks that contain the query tokens, scored by TF.
    pub fn search_text(&self, query: &str, top_k: usize) -> Vec<SearchHit> {
        let query_tokens = tokenize_for_index(query);

        if query_tokens.is_empty() {
            return Vec::new();
        }

        // Accumulate scores for each chunk
        let mut scores: HashMap<u64, f32> = HashMap::new();

        for token in &query_tokens {
            if let Some(posting_list) = self.inverted_index.get(token) {
                for &(chunk_id, frequency) in posting_list {
                    *scores.entry(chunk_id).or_default() += frequency as f32;
                }
            }
        }

        // Normalize by number of query tokens
        let num_tokens = query_tokens.len() as f32;
        let mut results: Vec<(u64, f32)> = scores
            .into_iter()
            .map(|(id, score)| (id, score / num_tokens))
            .collect();

        results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

        results
            .into_iter()
            .take(top_k)
            .map(|(chunk_id, score)| SearchHit { chunk_id, score })
            .collect()
    }

    /// Get a chunk by its ID.
    pub fn get_chunk(&self, chunk_id: u64) -> Option<&CodeChunk> {
        self.chunks.get(&chunk_id)
    }

    /// Save the index to a directory.
    ///
    /// Uses bincode for fast binary serialization (v2 format).
    pub fn save(&self, dir: &Path) -> Result<(), IndexError> {
        std::fs::create_dir_all(dir)?;

        let index_path = dir.join("index.bin");
        let data = bincode::serialize(self)
            .map_err(|e| IndexError::Serialization(e.to_string()))?;
        std::fs::write(&index_path, data)?;

        // Remove old JSON index if present (migration from v1)
        let old_json_path = dir.join("index.json");
        if old_json_path.exists() {
            let _ = std::fs::remove_file(&old_json_path);
        }

        tracing::info!(
            chunks = self.chunk_count,
            path = %dir.display(),
            "Index saved (bincode v2)"
        );

        Ok(())
    }

    /// Load an index from a directory.
    ///
    /// Tries bincode format (v2) first, then falls back to JSON (v1) for migration.
    /// After loading, rebuilds the HNSW graph from the vectors for fast search.
    pub fn load(dir: &Path) -> Result<Self, IndexError> {
        let bin_path = dir.join("index.bin");
        let json_path = dir.join("index.json");

        let mut index: SeekrIndex = if bin_path.exists() {
            // v2: bincode format
            let data = std::fs::read(&bin_path)?;
            bincode::deserialize(&data)
                .map_err(|e| IndexError::Serialization(e.to_string()))?
        } else if json_path.exists() {
            // v1: JSON format (backward compatibility)
            let data = std::fs::read(&json_path)?;
            serde_json::from_slice(&data)
                .map_err(|e| IndexError::Serialization(e.to_string()))?
        } else {
            return Err(IndexError::NotFound(bin_path));
        };

        // Version check
        if index.version != INDEX_VERSION {
            return Err(IndexError::VersionMismatch {
                file_version: index.version,
                expected_version: INDEX_VERSION,
            });
        }

        // Rebuild HNSW graph from loaded vectors
        index.rebuild_hnsw();

        tracing::info!(
            chunks = index.chunk_count,
            path = %dir.display(),
            "Index loaded (HNSW rebuilt)"
        );

        Ok(index)
    }
}

/// Compute the cosine similarity between two vectors.
///
/// Returns 0.0 for empty input, mismatched lengths, or zero-magnitude
/// vectors, so callers never hit a division by zero.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    // Accumulate dot product and both squared magnitudes in one pass.
    let mut dot = 0.0_f32;
    let mut sq_a = 0.0_f32;
    let mut sq_b = 0.0_f32;
    for (&x, &y) in a.iter().zip(b.iter()) {
        dot += x * y;
        sq_a += x * x;
        sq_b += y * y;
    }

    let norm_a = sq_a.sqrt();
    let norm_b = sq_b.sqrt();
    if norm_a == 0.0 || norm_b == 0.0 {
        0.0
    } else {
        dot / (norm_a * norm_b)
    }
}

/// Simple tokenization for the inverted text index.
///
/// Splits the input on every character that is neither alphanumeric nor an
/// underscore, lowercases each piece, and keeps only tokens of at least two
/// bytes (dropping the empty strings produced by consecutive separators).
fn tokenize_for_index(text: &str) -> Vec<String> {
    let is_separator = |c: char| !(c.is_alphanumeric() || c == '_');
    text.split(is_separator)
        .filter_map(|piece| {
            let token = piece.to_lowercase();
            // Byte length, matching the index's historical behavior.
            (token.len() >= 2).then_some(token)
        })
        .collect()
}

/// Public wrapper for `tokenize_for_index` — used by incremental indexing.
///
/// Kept as a thin pass-through so the core tokenizer can stay private to
/// this module while callers elsewhere in the crate reuse the exact same
/// tokenization rules as the index build path.
pub fn tokenize_for_index_pub(text: &str) -> Vec<String> {
    tokenize_for_index(text)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::ChunkKind;
    use std::path::PathBuf;

    /// Build a minimal function chunk with the given id, name, and body;
    /// all other fields are fixed placeholder values.
    fn make_test_chunk(id: u64, name: &str, body: &str) -> CodeChunk {
        CodeChunk {
            id,
            file_path: PathBuf::from("test.rs"),
            language: "rust".to_string(),
            kind: ChunkKind::Function,
            name: Some(name.to_string()),
            signature: None,
            doc_comment: None,
            body: body.to_string(),
            byte_range: 0..body.len(),
            line_range: 0..1,
        }
    }

    #[test]
    fn test_cosine_similarity() {
        // Orthogonal vectors -> similarity ~0.
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![0.0, 1.0, 0.0];
        assert!((cosine_similarity(&a, &b)).abs() < 0.01);

        // Identical vectors -> similarity ~1.
        let c = vec![1.0, 0.0, 0.0];
        assert!((cosine_similarity(&a, &c) - 1.0).abs() < 0.01);
    }

    #[test]
    fn test_build_and_search_text() {
        let chunks = vec![
            make_test_chunk(1, "authenticate", "fn authenticate(user: &str, password: &str) -> Result<Token, Error>"),
            make_test_chunk(2, "calculate", "fn calculate_total(items: &[Item]) -> f64"),
        ];
        let embeddings = vec![vec![0.1; 8], vec![0.2; 8]];

        let index = SeekrIndex::build_from(&chunks, &embeddings, 8);

        assert_eq!(index.chunk_count, 2);

        // Text search: all three query tokens occur only in chunk 1's body,
        // so it must rank first.
        let results = index.search_text("authenticate user password", 10);
        assert!(!results.is_empty());
        assert_eq!(results[0].chunk_id, 1);
    }

    #[test]
    fn test_build_and_search_vector() {
        let chunks = vec![
            make_test_chunk(1, "foo", "fn foo()"),
            make_test_chunk(2, "bar", "fn bar()"),
        ];
        // Orthogonal unit vectors so similarity ordering is unambiguous.
        let embeddings = vec![vec![1.0, 0.0, 0.0], vec![0.0, 1.0, 0.0]];

        let index = SeekrIndex::build_from(&chunks, &embeddings, 3);

        // Search for something similar to chunk 1 (threshold 0.0 keeps both).
        let query = vec![0.9, 0.1, 0.0];
        let results = index.search_vector(&query, 2, 0.0);
        assert!(!results.is_empty());
        assert_eq!(results[0].chunk_id, 1, "Should find the most similar chunk first");
    }

    #[test]
    fn test_save_and_load() {
        // Round-trip through the bincode v2 on-disk format in a temp dir.
        let chunks = vec![make_test_chunk(1, "test", "fn test() {}")];
        let embeddings = vec![vec![0.5; 4]];
        let index = SeekrIndex::build_from(&chunks, &embeddings, 4);

        let dir = tempfile::tempdir().unwrap();
        index.save(dir.path()).unwrap();

        let loaded = SeekrIndex::load(dir.path()).unwrap();
        assert_eq!(loaded.chunk_count, 1);
        assert_eq!(loaded.version, INDEX_VERSION);
    }

    #[test]
    fn test_tokenize_for_index() {
        // Punctuation splits tokens; underscores are kept inside a token;
        // everything is lowercased.
        let tokens = tokenize_for_index("fn authenticate_user(username: &str) -> Result<String>");
        assert!(tokens.contains(&"fn".to_string()));
        assert!(tokens.contains(&"authenticate_user".to_string()));
        assert!(tokens.contains(&"username".to_string()));
        assert!(tokens.contains(&"result".to_string()));
        assert!(tokens.contains(&"string".to_string()));
    }
}