1mod graph;
4
5use crate::constants::repomap as repomap_consts;
6
7fn summarize_symbol(symbol: &Symbol) -> Option<String> {
8 let line = symbol
9 .docstring
10 .as_deref()
11 .and_then(first_nonempty_line)
12 .or_else(|| symbol.signature.as_deref().and_then(first_nonempty_line));
13
14 line.map(|text| truncate_summary(text, repomap_consts::SUMMARY_MAX_LEN))
15}
16
17fn first_nonempty_line(text: &str) -> Option<&str> {
18 text.lines()
19 .map(|line| line.trim())
20 .find(|line| !line.is_empty())
21}
22
23fn truncate_summary(text: &str, max_len: usize) -> String {
24 if text.chars().count() <= max_len {
25 return text.to_owned();
26 }
27
28 let mut truncated: String = text.chars().take(max_len).collect();
29 truncated = truncated.trim_end().to_owned();
30 truncated.push_str("...");
31 truncated
32}
33
34#[cfg(test)]
35use crate::types::RepoFile;
36use crate::types::{Repository, Symbol, SymbolKind, TokenizerModel};
37use graph::SymbolGraph;
38use serde::Serialize;
39use std::collections::HashMap;
40use std::fmt;
41use typed_builder::TypedBuilder;
42
43#[derive(Debug, Clone, Serialize)]
45pub struct RepoMap {
46 pub summary: String,
48 pub key_symbols: Vec<RankedSymbol>,
50 pub module_graph: ModuleGraph,
52 pub file_index: Vec<FileIndexEntry>,
54 pub token_count: u32,
56}
57
58impl fmt::Display for RepoMap {
59 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
60 write!(
61 f,
62 "RepoMap({} symbols, {} files, {} tokens)",
63 self.key_symbols.len(),
64 self.file_index.len(),
65 self.token_count
66 )
67 }
68}
69
70#[derive(Debug, Clone, Serialize)]
72pub struct RankedSymbol {
73 pub name: String,
75 pub kind: String,
77 pub file: String,
79 pub line: u32,
81 pub signature: Option<String>,
83 pub summary: Option<String>,
85 pub references: u32,
87 pub rank: u32,
89 pub importance: f32,
91}
92
93#[derive(Debug, Clone, Serialize)]
95pub struct ModuleGraph {
96 pub nodes: Vec<ModuleNode>,
98 pub edges: Vec<ModuleEdge>,
100}
101
102#[derive(Debug, Clone, Serialize)]
104pub struct ModuleNode {
105 pub name: String,
107 pub files: u32,
109 pub tokens: u32,
111}
112
113#[derive(Debug, Clone, Serialize)]
115pub struct ModuleEdge {
116 pub from: String,
118 pub to: String,
120 pub weight: u32,
122}
123
124#[derive(Debug, Clone, Serialize)]
126pub struct FileIndexEntry {
127 pub path: String,
129 pub tokens: u32,
131 pub importance: String,
133 pub summary: Option<String>,
135}
136
137#[derive(Debug, Clone, TypedBuilder)]
154pub struct RepoMapGenerator {
155 #[builder(default, setter(strip_option))]
157 max_symbols: Option<usize>,
158
159 #[builder(default = TokenizerModel::Claude)]
161 model: TokenizerModel,
162
163 #[builder(default = 2000)]
165 token_budget: u32,
166}
167
168impl RepoMapGenerator {
169 pub fn new(token_budget: u32) -> Self {
173 Self { max_symbols: None, model: TokenizerModel::Claude, token_budget }
174 }
175
176 fn effective_max_symbols(&self) -> usize {
180 self.max_symbols.unwrap_or_else(|| {
181 ((self.token_budget as usize) / repomap_consts::BUDGET_SYMBOL_DIVISOR)
184 .clamp(repomap_consts::MIN_SYMBOLS, repomap_consts::MAX_SYMBOLS)
185 })
186 }
187
188 #[must_use = "repository map should be used or formatted"]
198 pub fn generate(&self, repo: &Repository) -> RepoMap {
199 let mut graph = SymbolGraph::new();
201 for file in &repo.files {
202 graph.add_file(file);
203 }
204
205 let symbol_index = self.build_symbol_index(repo);
207
208 self.extract_references_fast(&mut graph, repo, &symbol_index);
210
211 let ranks = graph.compute_pagerank(0.85, 20); let key_symbols = self.build_ranked_symbols_fast(&graph, &ranks);
216
217 let module_graph = self.build_module_graph(repo);
219
220 let file_index = self.build_file_index(repo, key_symbols.len());
222
223 let summary = self.generate_summary(repo, &key_symbols);
225
226 let token_count = self.estimate_tokens(&key_symbols, &file_index);
228
229 RepoMap { summary, key_symbols, module_graph, file_index, token_count }
230 }
231
232 fn build_symbol_index(&self, repo: &Repository) -> HashMap<String, Vec<String>> {
237 let mut index: HashMap<String, Vec<String>> = HashMap::new();
238 for file in &repo.files {
239 let path_key = Self::strip_language_extension(&file.relative_path);
241
242 for symbol in &file.symbols {
243 let symbol_key = format!("{}:{}", file.relative_path, symbol.name);
244
245 index
247 .entry(symbol.name.clone())
248 .or_default()
249 .push(symbol_key.clone());
250
251 index
254 .entry(path_key.to_owned())
255 .or_default()
256 .push(symbol_key);
257 }
258 }
259 index
260 }
261
262 fn extract_references_fast(
267 &self,
268 graph: &mut SymbolGraph,
269 repo: &Repository,
270 index: &HashMap<String, Vec<String>>,
271 ) {
272 for file in &repo.files {
273 for symbol in &file.symbols {
274 let from_key = format!("{}:{}", file.relative_path, symbol.name);
275
276 if symbol.kind == SymbolKind::Import {
278 if let Some(targets) = index.get(&symbol.name) {
280 for target in targets {
281 if target != &from_key {
283 graph.add_reference(&from_key, target, graph::EdgeType::Imports);
284 }
285 }
286 }
287 }
288
289 if let Some(ref base_class) = symbol.extends {
291 if let Some(targets) = index.get(base_class) {
293 for target in targets {
294 if target != &from_key {
295 graph.add_reference(&from_key, target, graph::EdgeType::Inherits);
296 }
297 }
298 } else {
299 graph.add_reference_by_name(
300 &from_key,
301 base_class,
302 graph::EdgeType::Inherits,
303 );
304 }
305 }
306
307 for iface in &symbol.implements {
309 if let Some(targets) = index.get(iface) {
310 for target in targets {
311 if target != &from_key {
312 graph.add_reference(&from_key, target, graph::EdgeType::Implements);
313 }
314 }
315 } else {
316 graph.add_reference_by_name(&from_key, iface, graph::EdgeType::Implements);
317 }
318 }
319
320 for called_name in &symbol.calls {
322 if let Some(targets) = index.get(called_name) {
323 for target in targets {
324 if target != &from_key {
325 graph.add_reference(&from_key, target, graph::EdgeType::Calls);
326 }
327 }
328 } else {
329 graph.add_reference_by_name(&from_key, called_name, graph::EdgeType::Calls);
330 }
331 }
332 }
333 }
334 }
335
336 fn build_ranked_symbols_fast(
338 &self,
339 graph: &SymbolGraph,
340 ranks: &HashMap<String, f64>,
341 ) -> Vec<RankedSymbol> {
342 let top_nodes = graph.get_top_symbols_with_ranks(ranks, self.effective_max_symbols());
343
344 top_nodes
345 .iter()
346 .enumerate()
347 .map(|(i, node)| {
348 let key = format!("{}:{}", node.file_path, node.symbol.name);
349 let rank_score = ranks.get(&key).copied().unwrap_or(0.0);
350
351 RankedSymbol {
352 name: node.symbol.name.clone(),
353 kind: node.symbol.kind.name().to_owned(),
354 file: node.file_path.clone(),
355 line: node.symbol.start_line,
356 signature: node.symbol.signature.clone(),
357 summary: summarize_symbol(&node.symbol),
358 references: node.symbol.references,
359 rank: (i + 1) as u32,
360 importance: rank_score as f32,
361 }
362 })
363 .collect()
364 }
365
366 fn build_module_graph(&self, repo: &Repository) -> ModuleGraph {
367 let mut modules: HashMap<String, ModuleNode> = HashMap::new();
368 let mut edge_counts: HashMap<(String, String), u32> = HashMap::new();
369
370 for file in &repo.files {
372 let module = file
373 .relative_path
374 .split('/')
375 .next()
376 .unwrap_or("root")
377 .to_owned();
378
379 let entry = modules.entry(module.clone()).or_insert(ModuleNode {
380 name: module.clone(),
381 files: 0,
382 tokens: 0,
383 });
384
385 entry.files += 1;
386 entry.tokens += file.token_count.get(self.model);
387 }
388
389 let mut symbol_to_module: HashMap<String, String> = HashMap::new();
391 for file in &repo.files {
392 let module = file
393 .relative_path
394 .split('/')
395 .next()
396 .unwrap_or("root")
397 .to_owned();
398
399 for symbol in &file.symbols {
400 symbol_to_module.insert(symbol.name.clone(), module.clone());
401 }
402 }
403
404 for file in &repo.files {
406 let from_module = file
407 .relative_path
408 .split('/')
409 .next()
410 .unwrap_or("root")
411 .to_owned();
412
413 for symbol in &file.symbols {
414 if symbol.kind == SymbolKind::Import {
416 if let Some(target_module) = symbol_to_module.get(&symbol.name) {
417 if target_module != &from_module {
418 *edge_counts
419 .entry((from_module.clone(), target_module.clone()))
420 .or_insert(0) += 1;
421 }
422 }
423 }
424
425 for called_name in &symbol.calls {
427 if let Some(target_module) = symbol_to_module.get(called_name) {
428 if target_module != &from_module {
429 *edge_counts
430 .entry((from_module.clone(), target_module.clone()))
431 .or_insert(0) += 1;
432 }
433 }
434 }
435 }
436 }
437
438 let edges: Vec<ModuleEdge> = edge_counts
440 .into_iter()
441 .map(|((from, to), weight)| ModuleEdge { from, to, weight })
442 .collect();
443
444 ModuleGraph { nodes: modules.into_values().collect(), edges }
445 }
446
447 fn build_file_index(&self, repo: &Repository, symbol_count: usize) -> Vec<FileIndexEntry> {
448 let mut files: Vec<_> = repo
449 .files
450 .iter()
451 .map(|f| {
452 let importance = if f.importance > 0.8 {
453 "critical"
454 } else if f.importance > 0.6 {
455 "high"
456 } else if f.importance > 0.3 {
457 "normal"
458 } else {
459 "low"
460 };
461
462 FileIndexEntry {
463 path: f.relative_path.clone(),
464 tokens: f.token_count.get(self.model),
465 importance: importance.to_owned(),
466 summary: None,
467 }
468 })
469 .collect();
470
471 files.sort_by(|a, b| {
473 let a_imp = match a.importance.as_str() {
474 "critical" => 4,
475 "high" => 3,
476 "normal" => 2,
477 _ => 1,
478 };
479 let b_imp = match b.importance.as_str() {
480 "critical" => 4,
481 "high" => 3,
482 "normal" => 2,
483 _ => 1,
484 };
485 b_imp.cmp(&a_imp)
486 });
487
488 let symbol_tokens = symbol_count as u32 * repomap_consts::TOKENS_PER_SYMBOL;
491 let remaining_budget = self
492 .token_budget
493 .saturating_sub(symbol_tokens)
494 .saturating_sub(repomap_consts::TOKEN_OVERHEAD);
495 let max_files = (remaining_budget / repomap_consts::TOKENS_PER_FILE) as usize;
496
497 if files.len() > max_files && max_files > 0 {
498 files.truncate(max_files);
499 }
500
501 files
502 }
503
504 fn generate_summary(&self, repo: &Repository, symbols: &[RankedSymbol]) -> String {
505 let mut seen = std::collections::HashSet::new();
507 let top_modules: Vec<_> = symbols
508 .iter()
509 .filter_map(|s| s.file.split('/').next())
510 .filter(|m| seen.insert(*m))
511 .take(3)
512 .collect();
513
514 let primary_lang = repo
515 .metadata
516 .languages
517 .first()
518 .map(|l| l.language.as_str())
519 .unwrap_or("unknown");
520
521 format!(
522 "Repository: {} ({} files, {} lines)\n\
523 Primary language: {}\n\
524 Key modules: {}",
525 repo.name,
526 repo.metadata.total_files,
527 repo.metadata.total_lines,
528 primary_lang,
529 top_modules.join(", ")
530 )
531 }
532
533 fn estimate_tokens(&self, symbols: &[RankedSymbol], files: &[FileIndexEntry]) -> u32 {
534 let symbol_tokens = symbols.len() as u32 * repomap_consts::TOKENS_PER_SYMBOL;
536 let file_tokens = files.len() as u32 * repomap_consts::TOKENS_PER_FILE;
537
538 symbol_tokens + file_tokens + repomap_consts::TOKEN_OVERHEAD
539 }
540
541 fn strip_language_extension(path: &str) -> &str {
544 const EXTENSIONS: &[&str] = &[
546 ".gemspec",
548 ".fsscript",
549 ".tsx",
551 ".jsx",
552 ".mjs",
553 ".cjs",
554 ".ts",
555 ".js",
556 ".cpp",
558 ".cxx",
559 ".hpp",
560 ".hxx",
561 ".cc",
562 ".hh",
563 ".c",
564 ".h",
565 ".cs",
567 ".rb",
569 ".rake",
570 ".bash",
572 ".zsh",
573 ".fish",
574 ".sh",
575 ".phtml",
577 ".php3",
578 ".php4",
579 ".php5",
580 ".phps",
581 ".php",
582 ".kts",
584 ".kt",
585 ".swift",
587 ".scala",
589 ".sc",
590 ".lhs",
592 ".hs",
593 ".heex",
595 ".leex",
596 ".exs",
597 ".eex",
598 ".ex",
599 ".cljs",
601 ".cljc",
602 ".edn",
603 ".clj",
604 ".mli",
606 ".ml",
607 ".fsx",
609 ".fsi",
610 ".fs",
611 ".lua",
613 ".rmd",
615 ".r",
616 ".pyw",
618 ".py",
619 ".rs",
621 ".go",
623 ".java",
625 ];
626
627 for ext in EXTENSIONS {
628 if let Some(stripped) = path.strip_suffix(ext) {
629 return stripped;
630 }
631 }
632 path
633 }
634}
635
636#[cfg(test)]
637#[allow(clippy::str_to_string)]
638mod tests {
639 use super::*;
640 use crate::types::{RepoMetadata, TokenCounts};
641
642 fn create_test_repo() -> Repository {
643 Repository {
644 name: "test-repo".to_owned(),
645 path: "/tmp/test".into(),
646 files: vec![RepoFile {
647 path: "/tmp/test/src/main.py".into(),
648 relative_path: "src/main.py".to_string(),
649 language: Some("python".to_string()),
650 size_bytes: 1000,
651 token_count: TokenCounts {
652 o200k: 240,
653 cl100k: 245,
654 claude: 250,
655 gemini: 230,
656 llama: 235,
657 mistral: 235,
658 deepseek: 235,
659 qwen: 235,
660 cohere: 238,
661 grok: 235,
662 },
663 symbols: vec![Symbol {
664 name: "main".to_string(),
665 kind: SymbolKind::Function,
666 signature: Some("def main() -> None".to_string()),
667 docstring: Some("Entry point".to_string()),
668 start_line: 10,
669 end_line: 25,
670 references: 5,
671 importance: 0.9,
672 parent: None,
673 visibility: crate::types::Visibility::Public,
674 calls: vec!["helper".to_string(), "process".to_string()],
675 extends: None,
676 implements: vec![],
677 }],
678 importance: 0.9,
679 content: None,
680 }],
681 metadata: RepoMetadata {
682 total_files: 1,
683 total_lines: 100,
684 total_tokens: TokenCounts {
685 o200k: 240,
686 cl100k: 245,
687 claude: 250,
688 gemini: 230,
689 llama: 235,
690 mistral: 235,
691 deepseek: 235,
692 qwen: 235,
693 cohere: 238,
694 grok: 235,
695 },
696 languages: vec![crate::types::LanguageStats {
697 language: "Python".to_string(),
698 files: 1,
699 lines: 100,
700 percentage: 100.0,
701 }],
702 framework: None,
703 description: None,
704 branch: None,
705 commit: None,
706 directory_structure: None,
707 external_dependencies: vec![],
708 git_history: None,
709 },
710 }
711 }
712
713 #[test]
714 fn test_generate_repomap() {
715 let repo = create_test_repo();
716 let generator = RepoMapGenerator::new(2000);
717 let map = generator.generate(&repo);
718
719 assert!(!map.summary.is_empty());
720 assert!(!map.file_index.is_empty());
721 }
722
723 #[test]
725 fn test_map_budget_affects_output() {
726 let small = RepoMapGenerator::new(500);
728 let medium = RepoMapGenerator::new(2000);
729 let large = RepoMapGenerator::new(10000);
730
731 let small_max = small.effective_max_symbols();
732 let medium_max = medium.effective_max_symbols();
733 let large_max = large.effective_max_symbols();
734
735 assert!(
737 small_max < medium_max,
738 "Small budget ({}) should have fewer symbols ({}) than medium ({}) with ({})",
739 500,
740 small_max,
741 2000,
742 medium_max
743 );
744
745 assert!(
747 medium_max < large_max,
748 "Medium budget ({}) should have fewer symbols ({}) than large ({}) with ({})",
749 2000,
750 medium_max,
751 10000,
752 large_max
753 );
754
755 assert!(
761 (20..=30).contains(&small_max),
762 "Small max_symbols should be ~25, got {}",
763 small_max
764 );
765 assert!(
766 (90..=110).contains(&medium_max),
767 "Medium max_symbols should be ~100, got {}",
768 medium_max
769 );
770 assert!(
771 (400..=500).contains(&large_max),
772 "Large max_symbols should be ~500, got {}",
773 large_max
774 );
775 }
776
777 #[test]
778 fn test_map_budget_min_max_clamped() {
779 use crate::constants::repomap as consts;
780
781 let tiny = RepoMapGenerator::new(10);
783 assert_eq!(tiny.effective_max_symbols(), consts::MIN_SYMBOLS);
784
785 let huge = RepoMapGenerator::new(100_000);
787 assert_eq!(huge.effective_max_symbols(), consts::MAX_SYMBOLS);
788 }
789}