1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
pub mod bm25;
pub mod disk_cache;
pub mod hasher;
pub mod parser;
pub mod ranker;
pub mod scanner;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use crate::repo_map::bm25::{Bm25Index, IndexUpdate};
use crate::repo_map::hasher::{ChangeStatus, FileHashCache};
use crate::repo_map::parser::{Symbol, SymbolKind};
use crate::repo_map::ranker::SymbolRanker;
use crate::repo_map::scanner::scan_source_files;
/// The complete repository map with incremental rebuild support.
///
/// Bundles the per-file symbol table, the change-detection cache, the symbol
/// ranker and the BM25 search index for a single project root, plus a lazily
/// populated cache of the rendered map text.
pub struct RepoMap {
/// Per-file symbol cache, keyed by the paths produced by the source scan
/// in `rebuild()`.
symbols: HashMap<PathBuf, Vec<Symbol>>,
/// BLAKE3-based change detection.
/// Consulted once per scanned file in `rebuild()`; a single entry can be
/// reset through `invalidate()`.
hash_cache: FileHashCache,
/// PageRank-inspired symbol ranker.
/// Decides the file order used when rendering the map text.
ranker: SymbolRanker,
/// BM25 full-text search index.
/// Built in full on the first `rebuild()`, updated incrementally afterwards.
bm25: Bm25Index,
/// The root directory of the project.
/// File paths are shown relative to this directory in the map text.
root: PathBuf,
/// Max approximate tokens for the map output.
/// Tokens are estimated as `bytes / 4` while rendering; `new()` sets this
/// to 2000.
max_tokens: usize,
/// Whether this is the first build (no prior index exists).
/// Starts `false` when state was restored from the disk cache and is
/// cleared by the first `rebuild()` otherwise.
first_build: bool,
/// Cached output of `to_map_string()`. Cleared by `rebuild()`,
/// `set_conversation_files()` and `invalidate()`.
/// Uses `Mutex` so the cache can be populated from `&self` calls
/// while remaining `Sync` (RepoMap is shared via RwLock across threads).
cached_map_string: std::sync::Mutex<Option<String>>,
}
impl RepoMap {
/// Create a repo map rooted at `root`.
///
/// If `disk_cache::load` yields a previously persisted state, that state is
/// adopted as-is and the map is marked as already built, so the next
/// `rebuild()` takes the incremental BM25 path. Otherwise every component
/// starts empty and the first `rebuild()` performs a full BM25 build.
pub fn new(root: &Path) -> Self {
    // Token budget applied to the rendered map text in both cases.
    const DEFAULT_MAX_TOKENS: usize = 2000;

    let project_root = root.to_path_buf();

    match disk_cache::load(root) {
        Some((symbols, hash_cache, bm25, ranker)) => {
            tracing::info!(
                files = symbols.len(),
                "RepoMap restored from disk cache — incremental rebuild only",
            );
            Self {
                symbols,
                hash_cache,
                ranker,
                bm25,
                root: project_root,
                max_tokens: DEFAULT_MAX_TOKENS,
                // A restored index already exists: skip the full BM25 build.
                first_build: false,
                cached_map_string: std::sync::Mutex::new(None),
            }
        }
        None => Self {
            symbols: HashMap::new(),
            hash_cache: FileHashCache::new(),
            ranker: SymbolRanker::new(),
            bm25: Bm25Index::new(),
            root: project_root,
            max_tokens: DEFAULT_MAX_TOKENS,
            first_build: true,
            cached_map_string: std::sync::Mutex::new(None),
        },
    }
}
/// Build or incrementally update the repo map.
///
/// Steps, in order:
/// 1. Scan the project root for source files and drop cached symbols of
///    files that are no longer present.
/// 2. Ask the hash cache which files are new or modified (serial).
/// 3. Parse those files in parallel and merge the results into the symbol
///    table. Files that fail to parse are skipped and keep whatever symbols
///    they had before.
/// 4. Rebuild the ranker's reference data if the symbol table changed.
/// 5. Bring the BM25 index up to date: a full build on the first run, an
///    incremental update (upserts plus removals) afterwards.
/// 6. Drop the cached map string if the symbol table changed and persist the
///    state to the disk cache. Persisting is best-effort: a failure is only
///    logged.
///
/// Returns the number of files that were successfully re-parsed.
pub fn rebuild(&mut self) -> usize {
    use rayon::prelude::*;

    let files = scan_source_files(&self.root);

    // Detect deleted files: anything tracked but absent from the scan goes.
    let current_files: std::collections::HashSet<_> = files.iter().cloned().collect();
    let tracked_before = self.symbols.len();
    self.symbols.retain(|k, _| current_files.contains(k));
    let removed = tracked_before - self.symbols.len();

    // Collect BM25 incremental updates
    let mut bm25_updates: Vec<IndexUpdate> = Vec::new();

    // Step 1 (serial): determine which files need re-parsing via hash check
    let mut to_parse: Vec<std::path::PathBuf> = Vec::new();
    for file in &files {
        match self.hash_cache.check_file(file) {
            ChangeStatus::Unchanged => continue,
            ChangeStatus::Modified | ChangeStatus::New => {
                to_parse.push(file.clone());
                bm25_updates.push(IndexUpdate::Upsert(file.clone()));
            }
            ChangeStatus::Error => {
                tracing::warn!("Failed to check file: {}", file.display());
            }
        }
    }

    // Step 2 (parallel): parse all changed files concurrently
    let parsed: Vec<(std::path::PathBuf, Vec<Symbol>)> = to_parse
        .par_iter()
        .filter_map(|file| match parser::parse_file(file) {
            Ok(syms) => Some((file.clone(), syms)),
            Err(_) => {
                // Still best-effort, but leave a trace instead of dropping
                // the failure silently.
                tracing::debug!("Failed to parse file: {}", file.display());
                None
            }
        })
        .collect();

    // Step 3 (serial): update symbol map
    let re_parsed = parsed.len();
    self.symbols.extend(parsed);

    // Deletions change the symbol table just as re-parses do, so both must
    // trigger re-ranking, cache invalidation and persistence.
    let symbols_changed = re_parsed > 0 || removed > 0;

    // Rebuild reference counts for ranking
    if symbols_changed {
        self.ranker.build_references(&self.symbols, &self.root);
    }

    // BM25: full build on first run, incremental thereafter
    if self.first_build {
        self.bm25.build(&files, &self.root);
        self.first_build = false;
    } else {
        // Detect deleted files: compare indexed paths to the current file set.
        // Emit IndexUpdate::Remove for any previously-indexed file no longer present.
        let indexed_paths: std::collections::HashSet<std::path::PathBuf> = self
            .bm25
            .indexed_files()
            .into_iter()
            .map(|rel| self.root.join(rel))
            .collect();
        for deleted in indexed_paths.difference(&current_files) {
            if self.bm25.contains(deleted) {
                bm25_updates.push(IndexUpdate::Remove(deleted.clone()));
            }
        }
        if !bm25_updates.is_empty() {
            self.bm25.update(&bm25_updates, &self.root);
        }
        // Belt-and-suspenders: evict any stale docs that may have been
        // missed by the incremental path (e.g. external file deletions
        // between two rebuild() calls).
        self.bm25.retain_files(&current_files);
    }

    // Invalidate cached map string — will be lazily recomputed.
    // `&mut self` guarantees exclusive access, so no locking is needed, and a
    // poisoned mutex is harmless because the value is simply overwritten.
    if symbols_changed {
        *self
            .cached_map_string
            .get_mut()
            .unwrap_or_else(std::sync::PoisonError::into_inner) = None;
    }

    tracing::info!(
        "Repo map: {} files, {} re-parsed, BM25: {} docs / {} terms",
        self.symbols.len(),
        re_parsed,
        self.bm25.doc_count(),
        self.bm25.term_count(),
    );

    // Persist to disk cache (best-effort: a failure is only logged).
    if (symbols_changed || self.symbols.is_empty())
        && let Err(e) = disk_cache::save(
            &self.root,
            &self.symbols,
            &self.hash_cache,
            &self.bm25,
            &self.ranker,
        )
    {
        tracing::warn!("Failed to save RepoMap disk cache: {e}");
    }

    re_parsed
}
/// Iterate over all (file, symbols) pairs.
///
/// Borrows directly from the internal symbol table; the iteration order is
/// that of the underlying `HashMap` and therefore unspecified.
pub fn all_symbols(&self) -> impl Iterator<Item = (&PathBuf, &Vec<Symbol>)> {
self.symbols.iter()
}
/// Set conversation-relevant files for priority ranking.
///
/// Forwards `files` to the ranker and drops the cached map string, since the
/// file ranking used to render it may now differ.
pub fn set_conversation_files(&mut self, files: Vec<PathBuf>) {
    self.ranker.set_conversation_files(files);

    // Ranking changed — invalidate cached string.
    // `&mut self` guarantees exclusive access, so reach the value through
    // `get_mut()` instead of locking. A poisoned mutex is recovered rather
    // than panicking because the stored value is overwritten anyway.
    *self
        .cached_map_string
        .get_mut()
        .unwrap_or_else(std::sync::PoisonError::into_inner) = None;
}
/// Invalidate cache for a specific file (after tool edits it).
///
/// Resets the hash-cache entry for `path`, drops the cached map string, and
/// removes the file from the BM25 index if it was indexed, so search results
/// don't reflect stale content.
pub fn invalidate(&mut self, path: &Path) {
    self.hash_cache.invalidate(path);

    // `&mut self` guarantees exclusive access, so no locking is needed, and a
    // poisoned mutex is harmless because the value is simply overwritten.
    *self
        .cached_map_string
        .get_mut()
        .unwrap_or_else(std::sync::PoisonError::into_inner) = None;

    // Remove from BM25 index so stale content isn't returned by search.
    if self.bm25.contains(path) {
        self.bm25
            .update(&[bm25::IndexUpdate::Remove(path.to_path_buf())], &self.root);
        tracing::trace!(
            path = %path.display(),
            // Always true in this branch: `contains(path)` just succeeded, so
            // there is no need to collect every indexed path to find out.
            was_indexed = true,
            "BM25: removed invalidated file"
        );
    }
}
/// Generate the map text for LLM context injection, ranked by importance.
/// Result is cached until the next `rebuild()` modifies symbols.
pub fn to_map_string(&self) -> String {
{
let cached = self.cached_map_string.lock().unwrap();
if let Some(ref s) = *cached {
return s.clone();
}
}
let result = self.to_map_string_inner();
*self.cached_map_string.lock().unwrap() = Some(result.clone());
result
}
/// Render the map text without consulting or updating the cache.
///
/// Files are emitted in the order returned by the ranker. Each file
/// contributes a `path:` header (relative to the project root when possible)
/// followed by one line per symbol. Size is estimated as `bytes / 4` tokens;
/// the budget is checked once per file, so a file that has been started is
/// always written in full.
fn to_map_string_inner(&self) -> String {
    let mut rendered = String::new();
    let mut token_estimate = 0;

    // Rank files by PageRank-style importance
    for (path, file_symbols, _score) in self.ranker.rank_files(&self.symbols) {
        // The budget check deliberately precedes the empty-file check.
        if token_estimate >= self.max_tokens {
            rendered.push_str("\n[... truncated to fit context window]\n");
            break;
        }
        if file_symbols.is_empty() {
            continue;
        }

        let shown = path.strip_prefix(&self.root).unwrap_or(path);
        let header = format!("{}:\n", shown.display());
        token_estimate += header.len() / 4;
        rendered.push_str(&header);

        for symbol in file_symbols {
            let kind_str = match symbol.kind {
                SymbolKind::Function => "fn",
                SymbolKind::Struct => "struct",
                SymbolKind::Enum => "enum",
                SymbolKind::Trait => "trait",
                SymbolKind::Impl => "impl",
                SymbolKind::Const => "const",
                SymbolKind::Type => "type",
                SymbolKind::Mod => "mod",
                SymbolKind::Macro => "macro",
            };
            let entry = match &symbol.signature {
                Some(sig) => format!(" {kind_str} {} [L{}] {sig}\n", symbol.name, symbol.line),
                None => format!(" {kind_str} {} [L{}]\n", symbol.name, symbol.line),
            };
            rendered.push_str(&entry);
            token_estimate += entry.len() / 4;
        }
    }

    rendered
}
/// Number of symbols across all tracked files (sum of the per-file lists).
pub fn symbol_count(&self) -> usize {
    self.symbols.values().map(Vec::len).sum()
}
/// Total number of files tracked.
///
/// Counts the entries of the symbol table, including files whose symbol
/// list is empty.
pub fn file_count(&self) -> usize {
self.symbols.len()
}
/// BM25 search: find relevant files for a query string.
///
/// Thin wrapper that forwards `query` and `max_results` unchanged to the
/// BM25 index and returns its results as-is.
pub fn search(&self, query: &str, max_results: usize) -> Vec<bm25::SearchResult> {
self.bm25.search(query, max_results)
}
/// Get relevant files for AI context injection based on user query.
///
/// Returns `(relative_path, score)` pairs ranked by BM25 relevance.
/// Thin wrapper: `query` and `top_k` are forwarded unchanged to the BM25
/// index.
pub fn relevant_files_for_query(&self, query: &str, top_k: usize) -> Vec<(String, f64)> {
self.bm25.relevant_files(query, top_k)
}
/// Access the BM25 index directly.
///
/// Returns a shared borrow; the index cannot be modified through it.
pub fn bm25(&self) -> &Bm25Index {
&self.bm25
}
/// Returns true if the repo map has been built at least once.
///
/// This is the negation of the `first_build` flag, so it is also true right
/// after `new()` when the state was restored from the disk cache, before any
/// `rebuild()` call.
pub fn is_ready(&self) -> bool {
!self.first_build
}
}