trusty-search 0.27.2

Machine-wide hybrid code search service: BM25 + vector + KG, zero cold-start, MCP server
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
//! File-level operations on [`CodeIndexer`]: removal, lookup, and entity access.
//!
//! Why: chunk removal (single id or whole file) and entity lookups are
//! orthogonal to the search/ingest hot paths. Lifting them out keeps each
//! `impl` block focused on a single concern.
//! What: `remove_file`, `remove_chunk`, the shared `remove_chunks_from_stores`
//! helper, `find_chunk_id`, `entities_for`, and `entity_exact_match`.
//! Test: covered by `test_remove_chunk_removes_from_results`,
//! `test_entity_exact_match_*` in `indexer::tests`.

use anyhow::Result;

use crate::core::chunker::RawChunk;
use crate::core::entity::EntityType;

use super::{build_compact_snippet, raw_to_code_chunk, CodeChunk, CodeIndexer};

impl CodeIndexer {
    /// Find a chunk whose `file` ends with `file_suffix` and (optionally) whose
    /// `function_name` equals `function`. When `function` is `None`, returns
    /// the lowest-line-numbered chunk in the matching file. Returns the chunk
    /// id, or `None` when nothing matches.
    pub async fn find_chunk_id(&self, file_suffix: &str, function: Option<&str>) -> Option<String> {
        self.ensure_chunks_loaded().await;
        let chunks = self.chunks.read().await;
        let matching: Vec<&RawChunk> = chunks
            .values()
            .filter(|c| c.file.ends_with(file_suffix))
            .filter(|c| match function {
                Some(f) => c.function_name.as_deref() == Some(f),
                None => true,
            })
            .collect();
        // Pick the earliest chunk in the file for stability.
        matching
            .into_iter()
            .min_by_key(|c| c.start_line)
            .map(|c| c.id.clone())
    }

    /// Materialize the whole corpus as `CodeChunk`s.
    ///
    /// Why: the quality / complexity endpoints (issue #32) need per-chunk
    /// data without going through the search pipeline.
    /// What: converts every stored `RawChunk` with a score of `0.0`, the
    /// match reason `"all"`, and no snippet. The read lock on the chunk map is
    /// held for the duration of the conversion.
    pub async fn all_chunks(&self) -> Vec<CodeChunk> {
        self.ensure_chunks_loaded().await;
        let guard = self.chunks.read().await;
        let mut snapshot = Vec::with_capacity(guard.len());
        for raw in guard.values() {
            snapshot.push(raw_to_code_chunk(raw, 0.0, "all", None, &self.root_path));
        }
        snapshot
    }

    /// Clone every `RawChunk` in the corpus into an owned `Vec` (issue #76).
    ///
    /// Why: the `get_call_chain` tool needs the full source body and doc
    /// comments of each candidate function, which the projected `CodeChunk`
    /// returned by [`Self::all_chunks`] does not carry. Handing back owned
    /// clones lets the caller work on the data after the lock is released.
    /// What: takes the read lock, clones each value of the chunk map, and
    /// returns the clones.
    /// Test: covered by `service::call_chain::tests`.
    pub async fn raw_chunks_snapshot(&self) -> Vec<RawChunk> {
        self.ensure_chunks_loaded().await;
        let guard = self.chunks.read().await;
        let mut snapshot: Vec<RawChunk> = Vec::with_capacity(guard.len());
        snapshot.extend(guard.values().cloned());
        snapshot
    }

    /// Paginated snapshot of chunks in a stable order. Used by
    /// `GET /indexes/:id/chunks?offset=&limit=` and the `list_chunks` MCP tool
    /// for batch iteration over the corpus.
    ///
    /// Why: clients (sidecar analyzers, external tooling) need to page through
    /// every chunk without receiving the entire corpus in one response.
    /// Deterministic ordering is required so successive pages don't overlap or
    /// skip rows when the underlying `HashMap` re-shuffles between calls.
    /// What: collects a reference to every `RawChunk`, sorts by
    /// `(file, start_line, end_line, id)` — the trailing `id` makes the order
    /// total even when two chunks share a file and line range — slices
    /// `[offset .. offset+limit]` (clamped to the corpus size), and
    /// materializes each entry into a `CodeChunk` (same shape as
    /// `all_chunks`). Returns `(total_chunks, page)` so the caller can
    /// serialize the `total` field without a second pass. An `offset` at or
    /// past the end, or a `limit` of zero, yields an empty page.
    /// Test: `test_enumerate_chunks_paginates_stable_order` indexes a couple of
    /// files, pages through them, and asserts no overlap and full coverage.
    pub async fn enumerate_chunks(&self, offset: usize, limit: usize) -> (usize, Vec<CodeChunk>) {
        self.ensure_chunks_loaded().await;
        let chunks = self.chunks.read().await;
        let total = chunks.len();
        if limit == 0 || offset >= total {
            return (total, Vec::new());
        }
        let mut ordered: Vec<&RawChunk> = chunks.values().collect();
        // The comparator is a total order (ids are the map keys), so the
        // unstable sort produces the same result as a stable one.
        ordered.sort_unstable_by(|a, b| {
            a.file
                .cmp(&b.file)
                .then_with(|| a.start_line.cmp(&b.start_line))
                .then_with(|| a.end_line.cmp(&b.end_line))
                .then_with(|| a.id.cmp(&b.id))
        });
        // Saturate instead of `offset + limit`: a caller-supplied limit near
        // `usize::MAX` would otherwise overflow and produce an invalid range.
        let end = offset.saturating_add(limit).min(total);
        let page: Vec<CodeChunk> = ordered[offset..end]
            .iter()
            .map(|raw| raw_to_code_chunk(raw, 0.0, "enumerate", None, &self.root_path))
            .collect();
        (total, page)
    }

    /// Cursor-paginate the chunk corpus in ascending `chunk_id` order, doing an
    /// indexed B-tree seek instead of a full-corpus scan (issue #1325).
    ///
    /// Why: [`Self::enumerate_chunks`] loads every chunk and re-sorts the whole
    /// corpus on every page request — O(N log N) per page — which times out
    /// (and 502s behind a proxy) at deep offsets on large indexes
    /// (`offset=304000`). When a durable [`CorpusStore`] is wired, this method
    /// instead seeks straight to the cursor in redb's `chunk_id`-keyed B-tree
    /// and reads one page: O(log N) + O(page) per call, so a forward scan over
    /// the whole corpus is O(N) total rather than O(N²/page). Indexers without
    /// a durable corpus (BM25-only / tests) fall back to the in-memory map,
    /// sorted by `chunk_id` as well, so both paths share the same cursor
    /// semantics (exclusive `after`, ascending id). This ordering differs from
    /// the `(file, start_line, ...)` ordering of [`Self::enumerate_chunks`];
    /// cursors and offsets are therefore not interchangeable.
    /// What: returns `(total, page, next_cursor)`. `total` is the corpus chunk
    /// count (`CorpusStore::chunk_count`, or the in-memory length); a failed
    /// `chunk_count` is treated as `0` and therefore yields an empty page.
    /// `page` is up to `limit` materialized [`CodeChunk`]s strictly after
    /// `after`. `next_cursor` is `Some(last_id)` when a full `limit`-sized page
    /// was returned (more rows may follow) and `None` once the page is short
    /// (end reached) — so a client loops until `next_cursor` is `None`. A
    /// failed or panicked page read is logged at `warn` and reported as an
    /// empty page with no cursor.
    /// Test: `test_enumerate_chunks_after_cursor_pages_via_redb` and
    /// `test_enumerate_chunks_after_cursor_in_memory_fallback`.
    pub async fn enumerate_chunks_after(
        &self,
        after: Option<&str>,
        limit: usize,
    ) -> (usize, Vec<CodeChunk>, Option<String>) {
        // Durable path: indexed seek over redb, no full-corpus materialization.
        if let Some(corpus) = self.corpus.clone() {
            let total = corpus.chunk_count().unwrap_or(0);
            if limit == 0 || total == 0 {
                return (total, Vec::new(), None);
            }
            // The blocking closure must own its inputs, hence the owned cursor.
            let after_owned = after.map(str::to_string);
            let raws = tokio::task::spawn_blocking(move || {
                corpus.chunks_after(after_owned.as_deref(), limit)
            })
            .await;
            let raws = match raws {
                Ok(Ok(raws)) => raws,
                Ok(Err(e)) => {
                    tracing::warn!("index '{}': cursor page read failed ({e})", self.index_id);
                    return (total, Vec::new(), None);
                }
                Err(e) => {
                    tracing::warn!("index '{}': cursor page task panicked ({e})", self.index_id);
                    return (total, Vec::new(), None);
                }
            };
            // A full page means more rows may follow; a short page is the end.
            let next_cursor = if raws.len() == limit {
                raws.last().map(|r| r.id.clone())
            } else {
                None
            };
            let page: Vec<CodeChunk> = raws
                .iter()
                .map(|raw| raw_to_code_chunk(raw, 0.0, "enumerate", None, &self.root_path))
                .collect();
            return (total, page, next_cursor);
        }

        // In-memory fallback (no durable corpus): reproduce the redb path's
        // cursor semantics by ordering on `chunk_id` alone, so the exclusive
        // `after` cursor is monotonic with the sort and `partition_point` finds
        // the resume point correctly.
        self.ensure_chunks_loaded().await;
        let chunks = self.chunks.read().await;
        let total = chunks.len();
        if limit == 0 || total == 0 {
            return (total, Vec::new(), None);
        }
        let mut ordered: Vec<&RawChunk> = chunks.values().collect();
        ordered.sort_by(|a, b| a.id.cmp(&b.id));
        let start = match after {
            // First index whose id is strictly greater than the cursor.
            Some(cursor) => ordered.partition_point(|r| r.id.as_str() <= cursor),
            None => 0,
        };
        // Saturate instead of `start + limit`: a limit near `usize::MAX` would
        // otherwise overflow and produce an invalid slice range.
        let end = start.saturating_add(limit).min(ordered.len());
        let slice = &ordered[start..end];
        let next_cursor = if slice.len() == limit {
            slice.last().map(|r| r.id.clone())
        } else {
            None
        };
        let page: Vec<CodeChunk> = slice
            .iter()
            .map(|raw| raw_to_code_chunk(raw, 0.0, "enumerate", None, &self.root_path))
            .collect();
        (total, page, next_cursor)
    }

    /// Run an HNSW-only similarity search against a precomputed embedding,
    /// excluding `exclude_id` (typically the seed chunk). Returns up to
    /// `top_k` `CodeChunk`s with `match_reason = "vector"`.
    ///
    /// What: asks the vector search for `top_k + 1` hits so that dropping the
    /// excluded id still leaves room for `top_k` results, skips hits whose
    /// chunk is no longer in the in-memory corpus, and attaches a compact
    /// snippet built from each chunk's content. `top_k == 0` returns an empty
    /// list without running a search.
    ///
    /// # Errors
    /// Propagates any error returned by the vector search.
    pub async fn similar_by_embedding(
        &self,
        embedding: &[f32],
        top_k: usize,
        exclude_id: Option<&str>,
    ) -> Result<Vec<CodeChunk>> {
        // Without this guard the loop below would push one hit before its
        // length check ran, returning a result for a request that asked for none.
        if top_k == 0 {
            return Ok(Vec::new());
        }
        // One extra hit compensates for the entry dropped via `exclude_id`.
        let want = top_k.saturating_add(1);
        let hits = self.vector_search(embedding, want).await?;
        self.ensure_chunks_loaded().await;
        let chunks = self.chunks.read().await;
        // Deliberately not `with_capacity(top_k)`: `top_k` is caller-supplied
        // and may be far larger than the number of hits actually returned.
        let mut out = Vec::new();
        for (id, score) in hits {
            if Some(id.as_str()) == exclude_id {
                continue;
            }
            // The hit may reference a chunk that is absent from the corpus map.
            let Some(raw) = chunks.get(&id) else { continue };
            let snippet = Some(build_compact_snippet(&raw.content));
            out.push(raw_to_code_chunk(
                raw,
                score,
                "vector",
                snippet,
                &self.root_path,
            ));
            if out.len() >= top_k {
                break;
            }
        }
        Ok(out)
    }

    /// Return a copy of the entity list recorded for `file_path`.
    ///
    /// What: takes the read lock on the in-memory entity map and clones the
    /// entry stored under `file_path`. Returns `None` when the map has no
    /// entry for that file.
    pub async fn entities_for(
        &self,
        file_path: &str,
    ) -> Option<Vec<crate::core::entity::RawEntity>> {
        let entity_map = self.entities.read().await;
        entity_map.get(file_path).cloned()
    }

    /// Issue #20: exact-name entity lookup.
    ///
    /// Why: rank-1 BM25 injection needs a strong anchor chunk for a query that
    /// is exactly a symbol name.
    /// What: trims `query` and bails out when it is empty or contains a space
    /// (multi-word queries are not symbol names). Otherwise walks the
    /// in-memory entity index looking for a `NamedType` or `ModulePath` entity
    /// whose text equals the query, ignoring ASCII case. For each such entity
    /// it searches for a chunk of the same file whose
    /// `[start_line, end_line]` range contains the entity's line, and returns
    /// the id of the first chunk found. Other entity types (string literals,
    /// annotations, error variants) are noisier and never anchor a match.
    /// Returns `None` when no entity/chunk pair qualifies.
    pub(super) async fn entity_exact_match(&self, query: &str) -> Option<String> {
        let needle = query.trim();
        if needle.is_empty() || needle.contains(' ') {
            return None;
        }
        self.ensure_chunks_loaded().await;
        // Lock order: entities first, then chunks.
        let entities = self.entities.read().await;
        let chunks = self.chunks.read().await;
        let anchor = entities
            .iter()
            .flat_map(|(file, ents)| ents.iter().map(move |ent| (file, ent)))
            .filter(|(_, ent)| {
                matches!(
                    ent.entity_type,
                    EntityType::NamedType | EntityType::ModulePath
                )
            })
            .filter(|(_, ent)| ent.text.eq_ignore_ascii_case(needle))
            .find_map(|(file, ent)| {
                // A matching entity with no enclosing chunk is skipped and the
                // scan continues with the next candidate.
                chunks
                    .values()
                    .find(|c| {
                        c.file == *file && ent.line >= c.start_line && ent.line <= c.end_line
                    })
                    .map(|c| c.id.clone())
            });
        anchor
    }

    /// Look up a chunk by id and return a copy of its raw text, or `None` if
    /// the id is not in the corpus.
    ///
    /// Why (issue #484): `search_similar` falls back to re-embedding a chunk's
    /// text when the LRU embedding cache misses — which always happens for
    /// `skip_kg=true` indexes because the cache is only populated on commit
    /// (i.e. during reindex) and evicted entries are never restored. This
    /// keyed lookup lets the handler obtain the seed text without taking a
    /// full corpus snapshot.
    /// What: ensures the in-memory `chunks` map is loaded, takes a read lock
    /// on it, and clones the matching `RawChunk::content`.
    /// Test: `test_chunk_content_by_id_returns_none_for_unknown` in
    /// `indexer::tests`.
    pub async fn chunk_content_by_id(&self, chunk_id: &str) -> Option<String> {
        self.ensure_chunks_loaded().await;
        let guard = self.chunks.read().await;
        let raw = guard.get(chunk_id)?;
        Some(raw.content.clone())
    }

    /// Drop all chunks and the entity list of one file, leaving the symbol
    /// graph untouched (issue #848 prune pass).
    ///
    /// Why: the prune pass in `service::reindex` removes multiple deleted files
    /// in a loop. Going through `remove_file` for each would trigger one full
    /// KG rebuild per deleted file, which is expensive and redundant because
    /// the reindex orchestrator rebuilds the KG once at the end of Phase 3.
    /// This method performs the same removals as `remove_file` but skips
    /// `rebuild_symbol_graph`, so the graph stays stale until the caller
    /// rebuilds it.
    /// What: removes the file's chunks from every store, then the in-memory
    /// entity map entry, then the durable entity row. Returns the number of
    /// chunks removed.
    /// Test: covered by `prune_deleted_files_cleans_staging_corpus` in
    /// `service::reindex::tests`.
    pub(crate) async fn remove_file_no_kg_rebuild(&self, file_path: &str) -> Result<usize> {
        self.ensure_chunks_loaded().await;
        // Collect the ids inside a scope so the read guard is released before
        // the removal below takes write locks.
        let doomed: Vec<String> = {
            let guard = self.chunks.read().await;
            guard
                .values()
                .filter_map(|c| (c.file == file_path).then(|| c.id.clone()))
                .collect()
        };
        self.remove_chunks_from_stores(&doomed).await;
        {
            let mut entity_map = self.entities.write().await;
            entity_map.remove(file_path);
        }
        self.delete_entities_from_redb(file_path).await;
        // No `rebuild_symbol_graph` here on purpose: the caller (prune pass)
        // rebuilds once after all files have been removed.
        Ok(doomed.len())
    }

    /// Remove every chunk belonging to a file, plus its entity list, and
    /// rebuild the symbol graph.
    ///
    /// Why: `index-file` re-indexes a file in place, but file deletion (and
    /// `FileWatcher` rename/remove events) needs to drop all of a file's
    /// chunks at once.
    /// What: delegates the removal to [`Self::remove_file_no_kg_rebuild`]. That
    /// method rehydrates the chunk map, evicts the file's chunks from every
    /// store, and drops the entity list both in memory and in the durable redb
    /// corpus (issue #28). The symbol graph is then rebuilt so it no longer
    /// references the file. Returns the number of chunks removed.
    ///
    /// # Errors
    /// Propagates any error from the removal step.
    pub async fn remove_file(&self, file_path: &str) -> Result<usize> {
        // Single source of truth for the removal sequence; this method only
        // adds the graph rebuild that the bulk prune path defers.
        let removed = self.remove_file_no_kg_rebuild(file_path).await?;
        self.rebuild_symbol_graph().await;
        Ok(removed)
    }

    /// Mirror an entity-list removal into the durable redb corpus (issue #28).
    ///
    /// Why: `remove_file` drops the in-memory entity list; the redb store must
    /// follow or a restart would rebuild a stale symbol graph.
    /// What: does nothing when no `CorpusStore` is wired (test / BM25-only
    /// indexers). Otherwise runs `CorpusStore::delete_entities` on a blocking
    /// worker (redb's API is sync). A store error or a failed worker task is
    /// logged at `warn` and never propagated — persistence cleanup must not
    /// fail a live in-memory removal.
    /// Test: covered by `tests::test_corpus_store_roundtrip` deletion paths.
    async fn delete_entities_from_redb(&self, file_path: &str) {
        if let Some(corpus) = self.corpus.clone() {
            // The blocking closure must own its inputs.
            let owned_path = file_path.to_owned();
            let outcome =
                tokio::task::spawn_blocking(move || corpus.delete_entities(&owned_path)).await;
            let index_id = &self.index_id;
            match outcome {
                Ok(Ok(())) => {}
                Ok(Err(e)) => {
                    tracing::warn!("index '{index_id}': redb entity delete failed ({e})");
                }
                Err(e) => {
                    tracing::warn!("index '{index_id}': redb entity delete task panicked ({e})");
                }
            }
        }
    }

    /// Mirror a chunk removal into the durable redb corpus (issue #28).
    ///
    /// Why: `remove_chunk` / `remove_file` evict chunks from every in-memory
    /// structure; the redb store must follow or a restart resurrects them.
    /// What: does nothing when no `CorpusStore` is wired or `ids` is empty.
    /// Otherwise runs `CorpusStore::delete_chunks` on a blocking worker with an
    /// owned copy of the ids. A store error or a failed worker task is logged
    /// at `warn` and never propagated.
    /// Test: covered by `tests::test_corpus_store_roundtrip` deletion paths.
    async fn delete_chunks_from_redb(&self, ids: &[String]) {
        let corpus = match self.corpus.clone() {
            Some(corpus) if !ids.is_empty() => corpus,
            _ => return,
        };
        // The blocking closure must own its inputs.
        let owned_ids = ids.to_vec();
        let outcome = tokio::task::spawn_blocking(move || corpus.delete_chunks(&owned_ids)).await;
        let index_id = &self.index_id;
        match outcome {
            Ok(Ok(())) => {}
            Ok(Err(e)) => {
                tracing::warn!("index '{index_id}': redb chunk delete failed ({e})");
            }
            Err(e) => {
                tracing::warn!("index '{index_id}': redb chunk delete task panicked ({e})");
            }
        }
    }

    /// Evict a batch of chunk ids from the vector store, the chunk map, the
    /// embedding cache, the BM25 index, and the durable corpus.
    ///
    /// Why: bulk deletion (for example a whole file) should take each write
    /// lock once for the whole batch rather than once per id, to bound
    /// write-lock contention.
    /// What: calls `store.remove` per id when a vector store is configured,
    /// ignoring its result (vector-store deletion is treated as non-fatal
    /// here). It then removes the ids from the chunk map, the embedding cache,
    /// and the BM25 index, each under a single write lock that is released
    /// before the next one is taken. Finally it mirrors the deletion into the
    /// redb corpus (issue #28).
    /// Test: covered indirectly by `test_remove_chunk_removes_from_results`.
    async fn remove_chunks_from_stores(&self, ids: &[String]) {
        if let Some(vector_store) = self.store.as_ref() {
            for chunk_id in ids {
                // Best effort: a failed vector removal must not abort the batch.
                let _ = vector_store.remove(chunk_id).await;
            }
        }

        let mut chunk_map = self.chunks.write().await;
        for chunk_id in ids {
            chunk_map.remove(chunk_id);
        }
        drop(chunk_map);

        let mut embedding_cache = self.chunk_embeddings.write().await;
        for chunk_id in ids {
            embedding_cache.pop(chunk_id);
        }
        drop(embedding_cache);

        let mut lexical_index = self.bm25.write().await;
        for chunk_id in ids {
            lexical_index.remove_document(chunk_id);
        }
        drop(lexical_index);

        self.delete_chunks_from_redb(ids).await;
    }

    /// Remove a single chunk from every store and rebuild the symbol graph.
    ///
    /// What: delegates to the shared batch helper `remove_chunks_from_stores`
    /// with a one-element batch. The helper evicts the id from the vector
    /// store (best effort), the chunk map, the embedding cache, the BM25
    /// index, and the durable redb corpus (issue #28), in that order. The
    /// symbol graph is rebuilt afterwards. Removing an id that is not present
    /// is not an error.
    ///
    /// # Errors
    /// Currently always returns `Ok(())`; the `Result` is kept for interface
    /// stability.
    pub async fn remove_chunk(&self, chunk_id: &str) -> Result<()> {
        // Reuse the batch path so the single-id and per-file removals cannot
        // drift apart in which structures they clean up.
        let ids = [chunk_id.to_string()];
        self.remove_chunks_from_stores(&ids).await;
        self.rebuild_symbol_graph().await;
        Ok(())
    }
}