1mod graph;
4
5const TOKENS_PER_SYMBOL: u32 = 25;
15
16const TOKENS_PER_FILE: u32 = 10;
18
19const TOKEN_OVERHEAD: u32 = 100;
21
22const BUDGET_SYMBOL_DIVISOR: usize = 30;
25const SUMMARY_MAX_LEN: usize = 120;
26
27fn summarize_symbol(symbol: &Symbol) -> Option<String> {
28 let line = symbol
29 .docstring
30 .as_deref()
31 .and_then(first_nonempty_line)
32 .or_else(|| symbol.signature.as_deref().and_then(first_nonempty_line));
33
34 line.map(|text| truncate_summary(text, SUMMARY_MAX_LEN))
35}
36
37fn first_nonempty_line(text: &str) -> Option<&str> {
38 text.lines()
39 .map(|line| line.trim())
40 .find(|line| !line.is_empty())
41}
42
43fn truncate_summary(text: &str, max_len: usize) -> String {
44 if text.chars().count() <= max_len {
45 return text.to_owned();
46 }
47
48 let mut truncated: String = text.chars().take(max_len).collect();
49 truncated = truncated.trim_end().to_owned();
50 truncated.push_str("...");
51 truncated
52}
53
54#[cfg(test)]
55use crate::types::RepoFile;
56use crate::types::{Repository, Symbol, SymbolKind, TokenizerModel};
57use graph::SymbolGraph;
58use serde::Serialize;
59use std::collections::HashMap;
60use std::fmt;
61use typed_builder::TypedBuilder;
62
63#[derive(Debug, Clone, Serialize)]
65pub struct RepoMap {
66 pub summary: String,
68 pub key_symbols: Vec<RankedSymbol>,
70 pub module_graph: ModuleGraph,
72 pub file_index: Vec<FileIndexEntry>,
74 pub token_count: u32,
76}
77
78impl fmt::Display for RepoMap {
79 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
80 write!(
81 f,
82 "RepoMap({} symbols, {} files, {} tokens)",
83 self.key_symbols.len(),
84 self.file_index.len(),
85 self.token_count
86 )
87 }
88}
89
90#[derive(Debug, Clone, Serialize)]
92pub struct RankedSymbol {
93 pub name: String,
95 pub kind: String,
97 pub file: String,
99 pub line: u32,
101 pub signature: Option<String>,
103 pub summary: Option<String>,
105 pub references: u32,
107 pub rank: u32,
109 pub importance: f32,
111}
112
113#[derive(Debug, Clone, Serialize)]
115pub struct ModuleGraph {
116 pub nodes: Vec<ModuleNode>,
118 pub edges: Vec<ModuleEdge>,
120}
121
122#[derive(Debug, Clone, Serialize)]
124pub struct ModuleNode {
125 pub name: String,
127 pub files: u32,
129 pub tokens: u32,
131}
132
133#[derive(Debug, Clone, Serialize)]
135pub struct ModuleEdge {
136 pub from: String,
138 pub to: String,
140 pub weight: u32,
142}
143
144#[derive(Debug, Clone, Serialize)]
146pub struct FileIndexEntry {
147 pub path: String,
149 pub tokens: u32,
151 pub importance: String,
153 pub summary: Option<String>,
155}
156
157#[derive(Debug, Clone, TypedBuilder)]
174pub struct RepoMapGenerator {
175 #[builder(default, setter(strip_option))]
177 max_symbols: Option<usize>,
178
179 #[builder(default = TokenizerModel::Claude)]
181 model: TokenizerModel,
182
183 #[builder(default = 2000)]
185 token_budget: u32,
186}
187
188impl RepoMapGenerator {
189 pub fn new(token_budget: u32) -> Self {
193 Self { max_symbols: None, model: TokenizerModel::Claude, token_budget }
194 }
195
196 fn effective_max_symbols(&self) -> usize {
198 self.max_symbols.unwrap_or_else(|| {
199 ((self.token_budget as usize) / BUDGET_SYMBOL_DIVISOR).clamp(10, 200)
201 })
202 }
203
204 #[must_use = "repository map should be used or formatted"]
214 pub fn generate(&self, repo: &Repository) -> RepoMap {
215 let mut graph = SymbolGraph::new();
217 for file in &repo.files {
218 graph.add_file(file);
219 }
220
221 let symbol_index = self.build_symbol_index(repo);
223
224 self.extract_references_fast(&mut graph, repo, &symbol_index);
226
227 let ranks = graph.compute_pagerank(0.85, 20); let key_symbols = self.build_ranked_symbols_fast(&graph, &ranks);
232
233 let module_graph = self.build_module_graph(repo);
235
236 let file_index = self.build_file_index(repo, key_symbols.len());
238
239 let summary = self.generate_summary(repo, &key_symbols);
241
242 let token_count = self.estimate_tokens(&key_symbols, &file_index);
244
245 RepoMap { summary, key_symbols, module_graph, file_index, token_count }
246 }
247
248 fn build_symbol_index(&self, repo: &Repository) -> HashMap<String, Vec<String>> {
253 let mut index: HashMap<String, Vec<String>> = HashMap::new();
254 for file in &repo.files {
255 let path_key = Self::strip_language_extension(&file.relative_path);
257
258 for symbol in &file.symbols {
259 let symbol_key = format!("{}:{}", file.relative_path, symbol.name);
260
261 index
263 .entry(symbol.name.clone())
264 .or_default()
265 .push(symbol_key.clone());
266
267 index
270 .entry(path_key.to_owned())
271 .or_default()
272 .push(symbol_key);
273 }
274 }
275 index
276 }
277
278 fn extract_references_fast(
283 &self,
284 graph: &mut SymbolGraph,
285 repo: &Repository,
286 index: &HashMap<String, Vec<String>>,
287 ) {
288 for file in &repo.files {
289 for symbol in &file.symbols {
290 let from_key = format!("{}:{}", file.relative_path, symbol.name);
291
292 if symbol.kind == SymbolKind::Import {
294 if let Some(targets) = index.get(&symbol.name) {
296 for target in targets {
297 if target != &from_key {
299 graph.add_reference(&from_key, target, graph::EdgeType::Imports);
300 }
301 }
302 }
303 }
304
305 if let Some(ref base_class) = symbol.extends {
307 if let Some(targets) = index.get(base_class) {
309 for target in targets {
310 if target != &from_key {
311 graph.add_reference(&from_key, target, graph::EdgeType::Inherits);
312 }
313 }
314 } else {
315 graph.add_reference_by_name(
316 &from_key,
317 base_class,
318 graph::EdgeType::Inherits,
319 );
320 }
321 }
322
323 for iface in &symbol.implements {
325 if let Some(targets) = index.get(iface) {
326 for target in targets {
327 if target != &from_key {
328 graph.add_reference(&from_key, target, graph::EdgeType::Implements);
329 }
330 }
331 } else {
332 graph.add_reference_by_name(&from_key, iface, graph::EdgeType::Implements);
333 }
334 }
335
336 for called_name in &symbol.calls {
338 if let Some(targets) = index.get(called_name) {
339 for target in targets {
340 if target != &from_key {
341 graph.add_reference(&from_key, target, graph::EdgeType::Calls);
342 }
343 }
344 } else {
345 graph.add_reference_by_name(&from_key, called_name, graph::EdgeType::Calls);
346 }
347 }
348 }
349 }
350 }
351
352 fn build_ranked_symbols_fast(
354 &self,
355 graph: &SymbolGraph,
356 ranks: &HashMap<String, f64>,
357 ) -> Vec<RankedSymbol> {
358 let top_nodes = graph.get_top_symbols_with_ranks(ranks, self.effective_max_symbols());
359
360 top_nodes
361 .iter()
362 .enumerate()
363 .map(|(i, node)| {
364 let key = format!("{}:{}", node.file_path, node.symbol.name);
365 let rank_score = ranks.get(&key).copied().unwrap_or(0.0);
366
367 RankedSymbol {
368 name: node.symbol.name.clone(),
369 kind: node.symbol.kind.name().to_owned(),
370 file: node.file_path.clone(),
371 line: node.symbol.start_line,
372 signature: node.symbol.signature.clone(),
373 summary: summarize_symbol(&node.symbol),
374 references: node.symbol.references,
375 rank: (i + 1) as u32,
376 importance: rank_score as f32,
377 }
378 })
379 .collect()
380 }
381
382 fn build_module_graph(&self, repo: &Repository) -> ModuleGraph {
383 let mut modules: HashMap<String, ModuleNode> = HashMap::new();
384 let mut edge_counts: HashMap<(String, String), u32> = HashMap::new();
385
386 for file in &repo.files {
388 let module = file
389 .relative_path
390 .split('/')
391 .next()
392 .unwrap_or("root")
393 .to_owned();
394
395 let entry = modules.entry(module.clone()).or_insert(ModuleNode {
396 name: module.clone(),
397 files: 0,
398 tokens: 0,
399 });
400
401 entry.files += 1;
402 entry.tokens += file.token_count.get(self.model);
403 }
404
405 let mut symbol_to_module: HashMap<String, String> = HashMap::new();
407 for file in &repo.files {
408 let module = file
409 .relative_path
410 .split('/')
411 .next()
412 .unwrap_or("root")
413 .to_owned();
414
415 for symbol in &file.symbols {
416 symbol_to_module.insert(symbol.name.clone(), module.clone());
417 }
418 }
419
420 for file in &repo.files {
422 let from_module = file
423 .relative_path
424 .split('/')
425 .next()
426 .unwrap_or("root")
427 .to_owned();
428
429 for symbol in &file.symbols {
430 if symbol.kind == SymbolKind::Import {
432 if let Some(target_module) = symbol_to_module.get(&symbol.name) {
433 if target_module != &from_module {
434 *edge_counts
435 .entry((from_module.clone(), target_module.clone()))
436 .or_insert(0) += 1;
437 }
438 }
439 }
440
441 for called_name in &symbol.calls {
443 if let Some(target_module) = symbol_to_module.get(called_name) {
444 if target_module != &from_module {
445 *edge_counts
446 .entry((from_module.clone(), target_module.clone()))
447 .or_insert(0) += 1;
448 }
449 }
450 }
451 }
452 }
453
454 let edges: Vec<ModuleEdge> = edge_counts
456 .into_iter()
457 .map(|((from, to), weight)| ModuleEdge { from, to, weight })
458 .collect();
459
460 ModuleGraph { nodes: modules.into_values().collect(), edges }
461 }
462
463 fn build_file_index(&self, repo: &Repository, symbol_count: usize) -> Vec<FileIndexEntry> {
464 let mut files: Vec<_> = repo
465 .files
466 .iter()
467 .map(|f| {
468 let importance = if f.importance > 0.8 {
469 "critical"
470 } else if f.importance > 0.6 {
471 "high"
472 } else if f.importance > 0.3 {
473 "normal"
474 } else {
475 "low"
476 };
477
478 FileIndexEntry {
479 path: f.relative_path.clone(),
480 tokens: f.token_count.get(self.model),
481 importance: importance.to_owned(),
482 summary: None,
483 }
484 })
485 .collect();
486
487 files.sort_by(|a, b| {
489 let a_imp = match a.importance.as_str() {
490 "critical" => 4,
491 "high" => 3,
492 "normal" => 2,
493 _ => 1,
494 };
495 let b_imp = match b.importance.as_str() {
496 "critical" => 4,
497 "high" => 3,
498 "normal" => 2,
499 _ => 1,
500 };
501 b_imp.cmp(&a_imp)
502 });
503
504 let symbol_tokens = symbol_count as u32 * TOKENS_PER_SYMBOL;
507 let remaining_budget = self
508 .token_budget
509 .saturating_sub(symbol_tokens)
510 .saturating_sub(TOKEN_OVERHEAD);
511 let max_files = (remaining_budget / TOKENS_PER_FILE) as usize;
512
513 if files.len() > max_files && max_files > 0 {
514 files.truncate(max_files);
515 }
516
517 files
518 }
519
520 fn generate_summary(&self, repo: &Repository, symbols: &[RankedSymbol]) -> String {
521 let mut seen = std::collections::HashSet::new();
523 let top_modules: Vec<_> = symbols
524 .iter()
525 .filter_map(|s| s.file.split('/').next())
526 .filter(|m| seen.insert(*m))
527 .take(3)
528 .collect();
529
530 let primary_lang = repo
531 .metadata
532 .languages
533 .first()
534 .map(|l| l.language.as_str())
535 .unwrap_or("unknown");
536
537 format!(
538 "Repository: {} ({} files, {} lines)\n\
539 Primary language: {}\n\
540 Key modules: {}",
541 repo.name,
542 repo.metadata.total_files,
543 repo.metadata.total_lines,
544 primary_lang,
545 top_modules.join(", ")
546 )
547 }
548
549 fn estimate_tokens(&self, symbols: &[RankedSymbol], files: &[FileIndexEntry]) -> u32 {
550 let symbol_tokens = symbols.len() as u32 * TOKENS_PER_SYMBOL;
552 let file_tokens = files.len() as u32 * TOKENS_PER_FILE;
553
554 symbol_tokens + file_tokens + TOKEN_OVERHEAD
555 }
556
557 fn strip_language_extension(path: &str) -> &str {
560 const EXTENSIONS: &[&str] = &[
562 ".gemspec",
564 ".fsscript",
565 ".tsx",
567 ".jsx",
568 ".mjs",
569 ".cjs",
570 ".ts",
571 ".js",
572 ".cpp",
574 ".cxx",
575 ".hpp",
576 ".hxx",
577 ".cc",
578 ".hh",
579 ".c",
580 ".h",
581 ".cs",
583 ".rb",
585 ".rake",
586 ".bash",
588 ".zsh",
589 ".fish",
590 ".sh",
591 ".phtml",
593 ".php3",
594 ".php4",
595 ".php5",
596 ".phps",
597 ".php",
598 ".kts",
600 ".kt",
601 ".swift",
603 ".scala",
605 ".sc",
606 ".lhs",
608 ".hs",
609 ".heex",
611 ".leex",
612 ".exs",
613 ".eex",
614 ".ex",
615 ".cljs",
617 ".cljc",
618 ".edn",
619 ".clj",
620 ".mli",
622 ".ml",
623 ".fsx",
625 ".fsi",
626 ".fs",
627 ".lua",
629 ".rmd",
631 ".r",
632 ".pyw",
634 ".py",
635 ".rs",
637 ".go",
639 ".java",
641 ];
642
643 for ext in EXTENSIONS {
644 if let Some(stripped) = path.strip_suffix(ext) {
645 return stripped;
646 }
647 }
648 path
649 }
650}
651
652#[cfg(test)]
653#[allow(clippy::str_to_string)]
654mod tests {
655 use super::*;
656 use crate::types::{RepoMetadata, TokenCounts};
657
658 fn create_test_repo() -> Repository {
659 Repository {
660 name: "test-repo".to_owned(),
661 path: "/tmp/test".into(),
662 files: vec![RepoFile {
663 path: "/tmp/test/src/main.py".into(),
664 relative_path: "src/main.py".to_string(),
665 language: Some("python".to_string()),
666 size_bytes: 1000,
667 token_count: TokenCounts {
668 o200k: 240,
669 cl100k: 245,
670 claude: 250,
671 gemini: 230,
672 llama: 235,
673 mistral: 235,
674 deepseek: 235,
675 qwen: 235,
676 cohere: 238,
677 grok: 235,
678 },
679 symbols: vec![Symbol {
680 name: "main".to_string(),
681 kind: SymbolKind::Function,
682 signature: Some("def main() -> None".to_string()),
683 docstring: Some("Entry point".to_string()),
684 start_line: 10,
685 end_line: 25,
686 references: 5,
687 importance: 0.9,
688 parent: None,
689 visibility: crate::types::Visibility::Public,
690 calls: vec!["helper".to_string(), "process".to_string()],
691 extends: None,
692 implements: vec![],
693 }],
694 importance: 0.9,
695 content: None,
696 }],
697 metadata: RepoMetadata {
698 total_files: 1,
699 total_lines: 100,
700 total_tokens: TokenCounts {
701 o200k: 240,
702 cl100k: 245,
703 claude: 250,
704 gemini: 230,
705 llama: 235,
706 mistral: 235,
707 deepseek: 235,
708 qwen: 235,
709 cohere: 238,
710 grok: 235,
711 },
712 languages: vec![crate::types::LanguageStats {
713 language: "Python".to_string(),
714 files: 1,
715 lines: 100,
716 percentage: 100.0,
717 }],
718 framework: None,
719 description: None,
720 branch: None,
721 commit: None,
722 directory_structure: None,
723 external_dependencies: vec![],
724 git_history: None,
725 },
726 }
727 }
728
729 #[test]
730 fn test_generate_repomap() {
731 let repo = create_test_repo();
732 let generator = RepoMapGenerator::new(2000);
733 let map = generator.generate(&repo);
734
735 assert!(!map.summary.is_empty());
736 assert!(!map.file_index.is_empty());
737 }
738}