1mod graph;
4
5const TOKENS_PER_SYMBOL: u32 = 25;
15
16const TOKENS_PER_FILE: u32 = 10;
18
19const TOKEN_OVERHEAD: u32 = 100;
21
22const BUDGET_SYMBOL_DIVISOR: usize = 20;
26const SUMMARY_MAX_LEN: usize = 120;
27
28fn summarize_symbol(symbol: &Symbol) -> Option<String> {
29 let line = symbol
30 .docstring
31 .as_deref()
32 .and_then(first_nonempty_line)
33 .or_else(|| symbol.signature.as_deref().and_then(first_nonempty_line));
34
35 line.map(|text| truncate_summary(text, SUMMARY_MAX_LEN))
36}
37
38fn first_nonempty_line(text: &str) -> Option<&str> {
39 text.lines()
40 .map(|line| line.trim())
41 .find(|line| !line.is_empty())
42}
43
44fn truncate_summary(text: &str, max_len: usize) -> String {
45 if text.chars().count() <= max_len {
46 return text.to_owned();
47 }
48
49 let mut truncated: String = text.chars().take(max_len).collect();
50 truncated = truncated.trim_end().to_owned();
51 truncated.push_str("...");
52 truncated
53}
54
55#[cfg(test)]
56use crate::types::RepoFile;
57use crate::types::{Repository, Symbol, SymbolKind, TokenizerModel};
58use graph::SymbolGraph;
59use serde::Serialize;
60use std::collections::HashMap;
61use std::fmt;
62use typed_builder::TypedBuilder;
63
64#[derive(Debug, Clone, Serialize)]
66pub struct RepoMap {
67 pub summary: String,
69 pub key_symbols: Vec<RankedSymbol>,
71 pub module_graph: ModuleGraph,
73 pub file_index: Vec<FileIndexEntry>,
75 pub token_count: u32,
77}
78
79impl fmt::Display for RepoMap {
80 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
81 write!(
82 f,
83 "RepoMap({} symbols, {} files, {} tokens)",
84 self.key_symbols.len(),
85 self.file_index.len(),
86 self.token_count
87 )
88 }
89}
90
91#[derive(Debug, Clone, Serialize)]
93pub struct RankedSymbol {
94 pub name: String,
96 pub kind: String,
98 pub file: String,
100 pub line: u32,
102 pub signature: Option<String>,
104 pub summary: Option<String>,
106 pub references: u32,
108 pub rank: u32,
110 pub importance: f32,
112}
113
114#[derive(Debug, Clone, Serialize)]
116pub struct ModuleGraph {
117 pub nodes: Vec<ModuleNode>,
119 pub edges: Vec<ModuleEdge>,
121}
122
123#[derive(Debug, Clone, Serialize)]
125pub struct ModuleNode {
126 pub name: String,
128 pub files: u32,
130 pub tokens: u32,
132}
133
134#[derive(Debug, Clone, Serialize)]
136pub struct ModuleEdge {
137 pub from: String,
139 pub to: String,
141 pub weight: u32,
143}
144
145#[derive(Debug, Clone, Serialize)]
147pub struct FileIndexEntry {
148 pub path: String,
150 pub tokens: u32,
152 pub importance: String,
154 pub summary: Option<String>,
156}
157
158#[derive(Debug, Clone, TypedBuilder)]
175pub struct RepoMapGenerator {
176 #[builder(default, setter(strip_option))]
178 max_symbols: Option<usize>,
179
180 #[builder(default = TokenizerModel::Claude)]
182 model: TokenizerModel,
183
184 #[builder(default = 2000)]
186 token_budget: u32,
187}
188
189impl RepoMapGenerator {
190 pub fn new(token_budget: u32) -> Self {
194 Self { max_symbols: None, model: TokenizerModel::Claude, token_budget }
195 }
196
197 fn effective_max_symbols(&self) -> usize {
201 self.max_symbols.unwrap_or_else(|| {
202 ((self.token_budget as usize) / BUDGET_SYMBOL_DIVISOR).clamp(5, 500)
205 })
206 }
207
208 #[must_use = "repository map should be used or formatted"]
218 pub fn generate(&self, repo: &Repository) -> RepoMap {
219 let mut graph = SymbolGraph::new();
221 for file in &repo.files {
222 graph.add_file(file);
223 }
224
225 let symbol_index = self.build_symbol_index(repo);
227
228 self.extract_references_fast(&mut graph, repo, &symbol_index);
230
231 let ranks = graph.compute_pagerank(0.85, 20); let key_symbols = self.build_ranked_symbols_fast(&graph, &ranks);
236
237 let module_graph = self.build_module_graph(repo);
239
240 let file_index = self.build_file_index(repo, key_symbols.len());
242
243 let summary = self.generate_summary(repo, &key_symbols);
245
246 let token_count = self.estimate_tokens(&key_symbols, &file_index);
248
249 RepoMap { summary, key_symbols, module_graph, file_index, token_count }
250 }
251
252 fn build_symbol_index(&self, repo: &Repository) -> HashMap<String, Vec<String>> {
257 let mut index: HashMap<String, Vec<String>> = HashMap::new();
258 for file in &repo.files {
259 let path_key = Self::strip_language_extension(&file.relative_path);
261
262 for symbol in &file.symbols {
263 let symbol_key = format!("{}:{}", file.relative_path, symbol.name);
264
265 index
267 .entry(symbol.name.clone())
268 .or_default()
269 .push(symbol_key.clone());
270
271 index
274 .entry(path_key.to_owned())
275 .or_default()
276 .push(symbol_key);
277 }
278 }
279 index
280 }
281
282 fn extract_references_fast(
287 &self,
288 graph: &mut SymbolGraph,
289 repo: &Repository,
290 index: &HashMap<String, Vec<String>>,
291 ) {
292 for file in &repo.files {
293 for symbol in &file.symbols {
294 let from_key = format!("{}:{}", file.relative_path, symbol.name);
295
296 if symbol.kind == SymbolKind::Import {
298 if let Some(targets) = index.get(&symbol.name) {
300 for target in targets {
301 if target != &from_key {
303 graph.add_reference(&from_key, target, graph::EdgeType::Imports);
304 }
305 }
306 }
307 }
308
309 if let Some(ref base_class) = symbol.extends {
311 if let Some(targets) = index.get(base_class) {
313 for target in targets {
314 if target != &from_key {
315 graph.add_reference(&from_key, target, graph::EdgeType::Inherits);
316 }
317 }
318 } else {
319 graph.add_reference_by_name(
320 &from_key,
321 base_class,
322 graph::EdgeType::Inherits,
323 );
324 }
325 }
326
327 for iface in &symbol.implements {
329 if let Some(targets) = index.get(iface) {
330 for target in targets {
331 if target != &from_key {
332 graph.add_reference(&from_key, target, graph::EdgeType::Implements);
333 }
334 }
335 } else {
336 graph.add_reference_by_name(&from_key, iface, graph::EdgeType::Implements);
337 }
338 }
339
340 for called_name in &symbol.calls {
342 if let Some(targets) = index.get(called_name) {
343 for target in targets {
344 if target != &from_key {
345 graph.add_reference(&from_key, target, graph::EdgeType::Calls);
346 }
347 }
348 } else {
349 graph.add_reference_by_name(&from_key, called_name, graph::EdgeType::Calls);
350 }
351 }
352 }
353 }
354 }
355
356 fn build_ranked_symbols_fast(
358 &self,
359 graph: &SymbolGraph,
360 ranks: &HashMap<String, f64>,
361 ) -> Vec<RankedSymbol> {
362 let top_nodes = graph.get_top_symbols_with_ranks(ranks, self.effective_max_symbols());
363
364 top_nodes
365 .iter()
366 .enumerate()
367 .map(|(i, node)| {
368 let key = format!("{}:{}", node.file_path, node.symbol.name);
369 let rank_score = ranks.get(&key).copied().unwrap_or(0.0);
370
371 RankedSymbol {
372 name: node.symbol.name.clone(),
373 kind: node.symbol.kind.name().to_owned(),
374 file: node.file_path.clone(),
375 line: node.symbol.start_line,
376 signature: node.symbol.signature.clone(),
377 summary: summarize_symbol(&node.symbol),
378 references: node.symbol.references,
379 rank: (i + 1) as u32,
380 importance: rank_score as f32,
381 }
382 })
383 .collect()
384 }
385
386 fn build_module_graph(&self, repo: &Repository) -> ModuleGraph {
387 let mut modules: HashMap<String, ModuleNode> = HashMap::new();
388 let mut edge_counts: HashMap<(String, String), u32> = HashMap::new();
389
390 for file in &repo.files {
392 let module = file
393 .relative_path
394 .split('/')
395 .next()
396 .unwrap_or("root")
397 .to_owned();
398
399 let entry = modules.entry(module.clone()).or_insert(ModuleNode {
400 name: module.clone(),
401 files: 0,
402 tokens: 0,
403 });
404
405 entry.files += 1;
406 entry.tokens += file.token_count.get(self.model);
407 }
408
409 let mut symbol_to_module: HashMap<String, String> = HashMap::new();
411 for file in &repo.files {
412 let module = file
413 .relative_path
414 .split('/')
415 .next()
416 .unwrap_or("root")
417 .to_owned();
418
419 for symbol in &file.symbols {
420 symbol_to_module.insert(symbol.name.clone(), module.clone());
421 }
422 }
423
424 for file in &repo.files {
426 let from_module = file
427 .relative_path
428 .split('/')
429 .next()
430 .unwrap_or("root")
431 .to_owned();
432
433 for symbol in &file.symbols {
434 if symbol.kind == SymbolKind::Import {
436 if let Some(target_module) = symbol_to_module.get(&symbol.name) {
437 if target_module != &from_module {
438 *edge_counts
439 .entry((from_module.clone(), target_module.clone()))
440 .or_insert(0) += 1;
441 }
442 }
443 }
444
445 for called_name in &symbol.calls {
447 if let Some(target_module) = symbol_to_module.get(called_name) {
448 if target_module != &from_module {
449 *edge_counts
450 .entry((from_module.clone(), target_module.clone()))
451 .or_insert(0) += 1;
452 }
453 }
454 }
455 }
456 }
457
458 let edges: Vec<ModuleEdge> = edge_counts
460 .into_iter()
461 .map(|((from, to), weight)| ModuleEdge { from, to, weight })
462 .collect();
463
464 ModuleGraph { nodes: modules.into_values().collect(), edges }
465 }
466
467 fn build_file_index(&self, repo: &Repository, symbol_count: usize) -> Vec<FileIndexEntry> {
468 let mut files: Vec<_> = repo
469 .files
470 .iter()
471 .map(|f| {
472 let importance = if f.importance > 0.8 {
473 "critical"
474 } else if f.importance > 0.6 {
475 "high"
476 } else if f.importance > 0.3 {
477 "normal"
478 } else {
479 "low"
480 };
481
482 FileIndexEntry {
483 path: f.relative_path.clone(),
484 tokens: f.token_count.get(self.model),
485 importance: importance.to_owned(),
486 summary: None,
487 }
488 })
489 .collect();
490
491 files.sort_by(|a, b| {
493 let a_imp = match a.importance.as_str() {
494 "critical" => 4,
495 "high" => 3,
496 "normal" => 2,
497 _ => 1,
498 };
499 let b_imp = match b.importance.as_str() {
500 "critical" => 4,
501 "high" => 3,
502 "normal" => 2,
503 _ => 1,
504 };
505 b_imp.cmp(&a_imp)
506 });
507
508 let symbol_tokens = symbol_count as u32 * TOKENS_PER_SYMBOL;
511 let remaining_budget = self
512 .token_budget
513 .saturating_sub(symbol_tokens)
514 .saturating_sub(TOKEN_OVERHEAD);
515 let max_files = (remaining_budget / TOKENS_PER_FILE) as usize;
516
517 if files.len() > max_files && max_files > 0 {
518 files.truncate(max_files);
519 }
520
521 files
522 }
523
524 fn generate_summary(&self, repo: &Repository, symbols: &[RankedSymbol]) -> String {
525 let mut seen = std::collections::HashSet::new();
527 let top_modules: Vec<_> = symbols
528 .iter()
529 .filter_map(|s| s.file.split('/').next())
530 .filter(|m| seen.insert(*m))
531 .take(3)
532 .collect();
533
534 let primary_lang = repo
535 .metadata
536 .languages
537 .first()
538 .map(|l| l.language.as_str())
539 .unwrap_or("unknown");
540
541 format!(
542 "Repository: {} ({} files, {} lines)\n\
543 Primary language: {}\n\
544 Key modules: {}",
545 repo.name,
546 repo.metadata.total_files,
547 repo.metadata.total_lines,
548 primary_lang,
549 top_modules.join(", ")
550 )
551 }
552
553 fn estimate_tokens(&self, symbols: &[RankedSymbol], files: &[FileIndexEntry]) -> u32 {
554 let symbol_tokens = symbols.len() as u32 * TOKENS_PER_SYMBOL;
556 let file_tokens = files.len() as u32 * TOKENS_PER_FILE;
557
558 symbol_tokens + file_tokens + TOKEN_OVERHEAD
559 }
560
561 fn strip_language_extension(path: &str) -> &str {
564 const EXTENSIONS: &[&str] = &[
566 ".gemspec",
568 ".fsscript",
569 ".tsx",
571 ".jsx",
572 ".mjs",
573 ".cjs",
574 ".ts",
575 ".js",
576 ".cpp",
578 ".cxx",
579 ".hpp",
580 ".hxx",
581 ".cc",
582 ".hh",
583 ".c",
584 ".h",
585 ".cs",
587 ".rb",
589 ".rake",
590 ".bash",
592 ".zsh",
593 ".fish",
594 ".sh",
595 ".phtml",
597 ".php3",
598 ".php4",
599 ".php5",
600 ".phps",
601 ".php",
602 ".kts",
604 ".kt",
605 ".swift",
607 ".scala",
609 ".sc",
610 ".lhs",
612 ".hs",
613 ".heex",
615 ".leex",
616 ".exs",
617 ".eex",
618 ".ex",
619 ".cljs",
621 ".cljc",
622 ".edn",
623 ".clj",
624 ".mli",
626 ".ml",
627 ".fsx",
629 ".fsi",
630 ".fs",
631 ".lua",
633 ".rmd",
635 ".r",
636 ".pyw",
638 ".py",
639 ".rs",
641 ".go",
643 ".java",
645 ];
646
647 for ext in EXTENSIONS {
648 if let Some(stripped) = path.strip_suffix(ext) {
649 return stripped;
650 }
651 }
652 path
653 }
654}
655
656#[cfg(test)]
657#[allow(clippy::str_to_string)]
658mod tests {
659 use super::*;
660 use crate::types::{RepoMetadata, TokenCounts};
661
662 fn create_test_repo() -> Repository {
663 Repository {
664 name: "test-repo".to_owned(),
665 path: "/tmp/test".into(),
666 files: vec![RepoFile {
667 path: "/tmp/test/src/main.py".into(),
668 relative_path: "src/main.py".to_string(),
669 language: Some("python".to_string()),
670 size_bytes: 1000,
671 token_count: TokenCounts {
672 o200k: 240,
673 cl100k: 245,
674 claude: 250,
675 gemini: 230,
676 llama: 235,
677 mistral: 235,
678 deepseek: 235,
679 qwen: 235,
680 cohere: 238,
681 grok: 235,
682 },
683 symbols: vec![Symbol {
684 name: "main".to_string(),
685 kind: SymbolKind::Function,
686 signature: Some("def main() -> None".to_string()),
687 docstring: Some("Entry point".to_string()),
688 start_line: 10,
689 end_line: 25,
690 references: 5,
691 importance: 0.9,
692 parent: None,
693 visibility: crate::types::Visibility::Public,
694 calls: vec!["helper".to_string(), "process".to_string()],
695 extends: None,
696 implements: vec![],
697 }],
698 importance: 0.9,
699 content: None,
700 }],
701 metadata: RepoMetadata {
702 total_files: 1,
703 total_lines: 100,
704 total_tokens: TokenCounts {
705 o200k: 240,
706 cl100k: 245,
707 claude: 250,
708 gemini: 230,
709 llama: 235,
710 mistral: 235,
711 deepseek: 235,
712 qwen: 235,
713 cohere: 238,
714 grok: 235,
715 },
716 languages: vec![crate::types::LanguageStats {
717 language: "Python".to_string(),
718 files: 1,
719 lines: 100,
720 percentage: 100.0,
721 }],
722 framework: None,
723 description: None,
724 branch: None,
725 commit: None,
726 directory_structure: None,
727 external_dependencies: vec![],
728 git_history: None,
729 },
730 }
731 }
732
733 #[test]
734 fn test_generate_repomap() {
735 let repo = create_test_repo();
736 let generator = RepoMapGenerator::new(2000);
737 let map = generator.generate(&repo);
738
739 assert!(!map.summary.is_empty());
740 assert!(!map.file_index.is_empty());
741 }
742
743 #[test]
745 fn test_map_budget_affects_output() {
746 let small = RepoMapGenerator::new(500);
748 let medium = RepoMapGenerator::new(2000);
749 let large = RepoMapGenerator::new(10000);
750
751 let small_max = small.effective_max_symbols();
752 let medium_max = medium.effective_max_symbols();
753 let large_max = large.effective_max_symbols();
754
755 assert!(
757 small_max < medium_max,
758 "Small budget ({}) should have fewer symbols ({}) than medium ({}) with ({})",
759 500,
760 small_max,
761 2000,
762 medium_max
763 );
764
765 assert!(
767 medium_max < large_max,
768 "Medium budget ({}) should have fewer symbols ({}) than large ({}) with ({})",
769 2000,
770 medium_max,
771 10000,
772 large_max
773 );
774
775 assert!(small_max >= 20 && small_max <= 30, "Small max_symbols should be ~25, got {}", small_max);
781 assert!(medium_max >= 90 && medium_max <= 110, "Medium max_symbols should be ~100, got {}", medium_max);
782 assert!(large_max >= 400 && large_max <= 500, "Large max_symbols should be ~500, got {}", large_max);
783 }
784
785 #[test]
786 fn test_map_budget_min_max_clamped() {
787 let tiny = RepoMapGenerator::new(10);
789 assert_eq!(tiny.effective_max_symbols(), 5);
790
791 let huge = RepoMapGenerator::new(100_000);
793 assert_eq!(huge.effective_max_symbols(), 500);
794 }
795}