1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
//! File-level operations on [`CodeIndexer`]: removal, lookup, and entity access.
//!
//! Why: chunk removal (single id or whole file) and entity lookups are
//! orthogonal to the search/ingest hot paths. Lifting them out keeps each
//! `impl` block focused on a single concern.
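//! What: `find_chunk_id`, the corpus snapshots (`all_chunks`,
//! `raw_chunks_snapshot`, `enumerate_chunks`), `similar_by_embedding`,
//! `entities_for`, `entity_exact_match`, `chunk_content_by_id`, and the
//! removal paths `remove_file` / `remove_chunk` together with the shared
//! `remove_chunks_from_stores` helper and the redb mirroring helpers.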
//! Test: covered by `test_remove_chunk_removes_from_results`,
//! `test_entity_exact_match_*` in `indexer::tests`.
use anyhow::Result;
use crate::core::chunker::RawChunk;
use crate::core::entity::EntityType;
use super::{build_compact_snippet, raw_to_code_chunk, CodeChunk, CodeIndexer};
impl CodeIndexer {
/// Find a chunk whose `file` ends with `file_suffix` and (optionally) whose
/// `function_name` equals `function`. When `function` is `None`, returns
/// the lowest-line-numbered chunk in the matching file. Returns the chunk
/// id, or `None` when nothing matches.
pub async fn find_chunk_id(&self, file_suffix: &str, function: Option<&str>) -> Option<String> {
self.ensure_chunks_loaded().await;
let chunks = self.chunks.read().await;
let matching: Vec<&RawChunk> = chunks
.values()
.filter(|c| c.file.ends_with(file_suffix))
.filter(|c| match function {
Some(f) => c.function_name.as_deref() == Some(f),
None => true,
})
.collect();
// Pick the earliest chunk in the file for stability.
matching
.into_iter()
.min_by_key(|c| c.start_line)
.map(|c| c.id.clone())
}
/// Snapshot every chunk in the corpus as a `CodeChunk`.
///
/// Why: the quality / complexity endpoints (issue #32) need to materialize
/// per-chunk metrics without going through the search pipeline.
/// What: converts each stored `RawChunk` via `raw_to_code_chunk` with a score
/// of `0.0`, match reason `"all"`, and no snippet. The result order follows
/// the iteration order of the underlying map and is therefore unspecified.
pub async fn all_chunks(&self) -> Vec<CodeChunk> {
    self.ensure_chunks_loaded().await;
    let chunks = self.chunks.read().await;
    // Borrow the root directly; cloning it only to take a reference is wasted work.
    chunks
        .values()
        .map(|raw| raw_to_code_chunk(raw, 0.0, "all", None, &self.root_path))
        .collect()
}
/// Clone the whole corpus as owned `RawChunk` values (issue #76).
///
/// Why: the `get_call_chain` tool needs the full source body and doc
/// comments of every candidate function, which the projected `CodeChunk`
/// shape returned by [`Self::all_chunks`] does not carry. Handing back owned
/// clones lets the caller work on them without holding any indexer lock.
/// What: takes the read lock, copies every value out of the chunk map, and
/// releases the lock on return.
/// Test: covered by `service::call_chain::tests`.
pub async fn raw_chunks_snapshot(&self) -> Vec<RawChunk> {
    self.ensure_chunks_loaded().await;
    let guard = self.chunks.read().await;
    let mut snapshot: Vec<RawChunk> = Vec::with_capacity(guard.len());
    snapshot.extend(guard.values().cloned());
    snapshot
}
/// Paginated snapshot of chunks in a stable order. Used by
/// `GET /indexes/:id/chunks?offset=&limit=` and the `list_chunks` MCP tool
/// for batch iteration over the corpus.
///
/// Why: clients (sidecar analyzers, external tooling) need to page through
/// every chunk without loading the entire corpus into memory at once.
/// Deterministic ordering is required so successive pages don't overlap or
/// skip rows when the underlying `HashMap` re-shuffles between calls.
/// What: collects every `RawChunk`, sorts by
/// `(file, start_line, end_line, id)` so that chunks sharing a file and line
/// range still have a fixed relative order, slices `[offset .. offset+limit]`
/// (clamped to the corpus size), and materializes each entry into a
/// `CodeChunk` (same shape as `all_chunks`, match reason `"enumerate"`).
/// Returns `(total_chunks, page)` so the caller can serialize the `total`
/// field without a second pass. A `limit` of `0` or an `offset` at or past
/// the end yields an empty page.
/// Test: `test_enumerate_chunks_paginates_stable_order` indexes a couple of
/// files, pages through them, and asserts no overlap and full coverage.
pub async fn enumerate_chunks(&self, offset: usize, limit: usize) -> (usize, Vec<CodeChunk>) {
    self.ensure_chunks_loaded().await;
    let chunks = self.chunks.read().await;
    let total = chunks.len();
    if limit == 0 || offset >= total {
        return (total, Vec::new());
    }
    let mut ordered: Vec<&RawChunk> = chunks.values().collect();
    ordered.sort_by(|a, b| {
        a.file
            .cmp(&b.file)
            .then_with(|| a.start_line.cmp(&b.start_line))
            .then_with(|| a.end_line.cmp(&b.end_line))
            // Final tie-break: without it, equal (file, range) keys would
            // fall back to map iteration order and pages could shift.
            .then_with(|| a.id.cmp(&b.id))
    });
    // Saturate so a huge client-supplied `limit` cannot overflow the sum.
    let end = offset.saturating_add(limit).min(total);
    let page: Vec<CodeChunk> = ordered[offset..end]
        .iter()
        .map(|raw| raw_to_code_chunk(raw, 0.0, "enumerate", None, &self.root_path))
        .collect();
    (total, page)
}
/// Run an HNSW-only similarity search against a precomputed embedding,
/// excluding `exclude_id` (typically the seed chunk).
///
/// What: asks `vector_search` for `top_k + 1` hits (one spare slot so the
/// excluded id does not cost a result), drops the excluded id and any hit
/// whose chunk is no longer in the corpus, and converts the survivors into
/// `CodeChunk`s with `match_reason = "vector"` and a compact snippet. Hit
/// order from `vector_search` is preserved.
/// Returns at most `top_k` chunks; `top_k == 0` returns an empty list
/// without running a search.
///
/// # Errors
/// Propagates any error returned by `vector_search`.
pub async fn similar_by_embedding(
    &self,
    embedding: &[f32],
    top_k: usize,
    exclude_id: Option<&str>,
) -> Result<Vec<CodeChunk>> {
    // Nothing requested: skip the search instead of returning a stray hit.
    if top_k == 0 {
        return Ok(Vec::new());
    }
    let want = top_k.saturating_add(1);
    let hits = self.vector_search(embedding, want).await?;
    self.ensure_chunks_loaded().await;
    let chunks = self.chunks.read().await;
    // Collecting through `take` avoids pre-reserving `top_k` slots, which
    // could abort on an absurdly large caller-supplied value.
    let out: Vec<CodeChunk> = hits
        .into_iter()
        .filter(|(id, _)| exclude_id != Some(id.as_str()))
        .filter_map(|(id, score)| {
            // A hit may reference a chunk that has since been removed.
            let raw = chunks.get(&id)?;
            let snippet = Some(build_compact_snippet(&raw.content));
            Some(raw_to_code_chunk(
                raw,
                score,
                "vector",
                snippet,
                &self.root_path,
            ))
        })
        .take(top_k)
        .collect();
    Ok(out)
}
/// Return a copy of the entity list recorded for `file_path`.
///
/// What: looks the path up in the in-memory entity map under a read lock and
/// clones the stored list. Yields `None` when the map has no entry for that
/// path (i.e. the file was never indexed).
pub async fn entities_for(
    &self,
    file_path: &str,
) -> Option<Vec<crate::core::entity::RawEntity>> {
    let by_file = self.entities.read().await;
    by_file.get(file_path).cloned()
}
/// Exact-name entity lookup (issue #20).
///
/// Why: gives rank-1 BM25 injection a strong anchor when the query is
/// literally the name of a known symbol.
/// What: trims `query`, bails out on empty input or input containing a
/// space (multi-word queries are not symbol names), then walks the in-memory
/// entity index looking for an entity whose text equals the query ignoring
/// ASCII case. For each such entity it searches the corpus for a chunk in
/// the same file whose `[start_line, end_line]` range contains the entity's
/// line, and returns the id of the first chunk found. Entities without a
/// covering chunk are skipped. Returns `None` when no entity yields a chunk.
///
/// Only `NamedType` and `ModulePath` entities are considered — these are
/// the taxonomy members that behave like symbol names. Other entity types
/// (string literals, annotations, error variants) are noisier and should
/// not anchor an exact-match boost.
pub(super) async fn entity_exact_match(&self, query: &str) -> Option<String> {
    let symbol = query.trim();
    let looks_like_symbol = !symbol.is_empty() && !symbol.contains(' ');
    if !looks_like_symbol {
        return None;
    }
    self.ensure_chunks_loaded().await;
    // Lock order: entities first, then chunks.
    let entity_index = self.entities.read().await;
    let corpus = self.chunks.read().await;
    let anchor = entity_index
        .iter()
        .flat_map(|(file, ents)| ents.iter().map(move |ent| (file, ent)))
        .filter(|(_, ent)| {
            matches!(
                ent.entity_type,
                EntityType::NamedType | EntityType::ModulePath
            )
        })
        .filter(|(_, ent)| ent.text.eq_ignore_ascii_case(symbol))
        .find_map(|(file, ent)| {
            // First chunk of that file whose line range covers the entity.
            corpus
                .values()
                .find(|c| c.file == *file && ent.line >= c.start_line && ent.line <= c.end_line)
                .map(|c| c.id.clone())
        });
    anchor
}
/// Fetch the raw text of one chunk by id; `None` when the id is not in the
/// corpus.
///
/// Why (issue #484): `search_similar` falls back to re-embedding a chunk's
/// text when the LRU embedding cache misses — which always happens for
/// `skip_kg=true` indexes because the cache is only populated on commit
/// (i.e. during reindex) and evicted entries are never restored. A keyed
/// lookup lets the handler obtain the seed text without loading the full
/// corpus snapshot.
/// What: makes sure the in-memory `chunks` map is loaded (it is lazily
/// rehydrated from redb if it was evicted), takes a brief read lock, and
/// returns a clone of the matching `RawChunk::content`.
/// Test: `test_chunk_content_by_id_returns_none_for_unknown` in
/// `indexer::tests`.
pub async fn chunk_content_by_id(&self, chunk_id: &str) -> Option<String> {
    self.ensure_chunks_loaded().await;
    let corpus = self.chunks.read().await;
    let raw = corpus.get(chunk_id)?;
    Some(raw.content.clone())
}
/// Drop all chunks that belong to `file_path`, along with the file's entity
/// list, and report how many chunks were removed.
///
/// Why: `index-file` re-indexes a file in place, but file deletion (and
/// `FileWatcher` rename/remove events) needs to drop all of a file's
/// chunks at once.
/// What: collects the ids of every chunk whose `file` equals `file_path`,
/// removes them through `remove_chunks_from_stores`, deletes the in-memory
/// and redb entity lists for the file, and rebuilds the symbol graph. The
/// entity cleanup and graph rebuild run even when no chunk matched.
pub async fn remove_file(&self, file_path: &str) -> Result<usize> {
    // The map may have been idle-evicted; reload it so the ids that key the
    // redb delete can still be discovered.
    self.ensure_chunks_loaded().await;
    let doomed: Vec<String> = {
        // Scoped so the read lock is released before any write lock is taken.
        let corpus = self.chunks.read().await;
        corpus
            .values()
            .filter_map(|c| (c.file == file_path).then(|| c.id.clone()))
            .collect()
    };
    self.remove_chunks_from_stores(&doomed).await;
    self.entities.write().await.remove(file_path);
    // Issue #28: the durable redb store must forget the entity list as well,
    // otherwise a restart would bring it back into the symbol graph.
    self.delete_entities_from_redb(file_path).await;
    self.rebuild_symbol_graph().await;
    Ok(doomed.len())
}
/// Evict one file's entity list from the durable redb corpus (issue #28).
///
/// Why: `remove_file` drops the in-memory entity list; the redb store must
/// follow or a restart would rebuild a stale symbol graph.
/// What: does nothing when no `CorpusStore` is wired (test / BM25-only
/// indexers). Otherwise runs `CorpusStore::delete_entities` on a blocking
/// worker, since redb's API is sync. Failures — both a returned error and a
/// failed worker task — are logged at `warn` and never propagated:
/// persistence cleanup must not fail a live in-memory removal.
/// Test: covered by `tests::test_corpus_store_roundtrip` deletion paths.
async fn delete_entities_from_redb(&self, file_path: &str) {
    let corpus = match self.corpus.clone() {
        Some(store) => store,
        None => return,
    };
    let index_id = self.index_id.clone();
    // The blocking closure must own its inputs.
    let owned_path = file_path.to_owned();
    let joined =
        tokio::task::spawn_blocking(move || corpus.delete_entities(&owned_path)).await;
    match joined {
        Err(e) => {
            tracing::warn!("index '{index_id}': redb entity delete task panicked ({e})")
        }
        Ok(Err(e)) => {
            tracing::warn!("index '{index_id}': redb entity delete failed ({e})")
        }
        Ok(Ok(())) => {}
    }
}
/// Evict a batch of chunk ids from the durable redb corpus (issue #28).
///
/// Why: `remove_chunk` / `remove_file` evict chunks from every in-memory
/// structure; the redb store must follow or a restart resurrects them.
/// What: returns immediately for an empty batch or when no `CorpusStore` is
/// wired. Otherwise runs `CorpusStore::delete_chunks` on a blocking worker
/// with an owned copy of the ids. Failures are logged at `warn`, never
/// propagated.
/// Test: covered by `tests::test_corpus_store_roundtrip` deletion paths.
async fn delete_chunks_from_redb(&self, ids: &[String]) {
    if ids.is_empty() {
        return;
    }
    let corpus = match self.corpus.clone() {
        Some(store) => store,
        None => return,
    };
    let index_id = self.index_id.clone();
    // The blocking closure must own its inputs.
    let owned_ids: Vec<String> = ids.to_vec();
    let joined = tokio::task::spawn_blocking(move || corpus.delete_chunks(&owned_ids)).await;
    match joined {
        Err(e) => {
            tracing::warn!("index '{index_id}': redb chunk delete task panicked ({e})")
        }
        Ok(Err(e)) => {
            tracing::warn!("index '{index_id}': redb chunk delete failed ({e})")
        }
        Ok(Ok(())) => {}
    }
}
/// Remove a batch of chunk ids from the HNSW store, the in-memory corpus,
/// the embedding cache, the BM25 index, and the durable redb corpus.
///
/// Why: `remove_file` needs bulk per-file deletion, and any other deletion
/// path can reuse the same sequence. Each lock is acquired once for the
/// whole batch to bound write-lock contention.
/// What: returns immediately for an empty batch so no write lock is taken
/// for nothing. Otherwise performs a best-effort `store.remove` per id
/// (store errors are swallowed — HNSW deletion is non-fatal in this
/// codebase), drops the ids from each in-memory structure under a single
/// write lock per structure, and finally mirrors the deletion into redb.
/// Does not rebuild the symbol graph; callers do that themselves.
/// Test: covered indirectly by `test_remove_chunk_removes_from_results`.
async fn remove_chunks_from_stores(&self, ids: &[String]) {
    if ids.is_empty() {
        return;
    }
    if let Some(store) = &self.store {
        for id in ids {
            // Best effort: a failed vector delete must not abort the batch.
            store.remove(id).await.ok();
        }
    }
    {
        let mut chunks = self.chunks.write().await;
        for id in ids {
            chunks.remove(id);
        }
    }
    {
        let mut emb = self.chunk_embeddings.write().await;
        for id in ids {
            emb.pop(id);
        }
    }
    {
        let mut bm25 = self.bm25.write().await;
        for id in ids {
            bm25.remove_document(id);
        }
    }
    // Issue #28: mirror the deletion into the durable redb corpus.
    self.delete_chunks_from_redb(ids).await;
}
/// Remove a single chunk from the corpus and every index that references it.
///
/// What: delegates to `remove_chunks_from_stores` with a one-element batch,
/// which drops the id from the HNSW store (best effort), the in-memory chunk
/// map, the embedding cache, the BM25 index, and the durable redb corpus
/// (issue #28). Afterwards the symbol graph is rebuilt. Removing an id that
/// is not present is not an error.
///
/// # Errors
/// Currently always returns `Ok(())`; store failures are swallowed or logged
/// by the helpers rather than propagated.
pub async fn remove_chunk(&self, chunk_id: &str) -> Result<()> {
    // Share the batch helper so single-chunk and per-file removal cannot
    // drift apart in which stores they clean up.
    self.remove_chunks_from_stores(&[chunk_id.to_string()])
        .await;
    self.rebuild_symbol_graph().await;
    Ok(())
}
}