1mod graph;
4
5use crate::constants::repomap as repomap_consts;
6
7fn summarize_symbol(symbol: &Symbol) -> Option<String> {
8 let line = symbol
9 .docstring
10 .as_deref()
11 .and_then(first_nonempty_line)
12 .or_else(|| symbol.signature.as_deref().and_then(first_nonempty_line));
13
14 line.map(|text| truncate_summary(text, repomap_consts::SUMMARY_MAX_LEN))
15}
16
17fn first_nonempty_line(text: &str) -> Option<&str> {
18 text.lines()
19 .map(|line| line.trim())
20 .find(|line| !line.is_empty())
21}
22
23fn truncate_summary(text: &str, max_len: usize) -> String {
24 if text.chars().count() <= max_len {
25 return text.to_owned();
26 }
27
28 let mut truncated: String = text.chars().take(max_len).collect();
29 truncated = truncated.trim_end().to_owned();
30 truncated.push_str("...");
31 truncated
32}
33
34#[cfg(test)]
35use crate::types::RepoFile;
36use crate::types::{Repository, Symbol, SymbolKind, TokenizerModel};
37use graph::SymbolGraph;
38use serde::Serialize;
39use std::collections::HashMap;
40use std::fmt;
41use typed_builder::TypedBuilder;
42
43#[derive(Debug, Clone, Serialize)]
45pub struct RepoMap {
46 pub summary: String,
48 pub key_symbols: Vec<RankedSymbol>,
50 pub module_graph: ModuleGraph,
52 pub file_index: Vec<FileIndexEntry>,
54 pub token_count: u32,
56}
57
58impl fmt::Display for RepoMap {
59 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
60 write!(
61 f,
62 "RepoMap({} symbols, {} files, {} tokens)",
63 self.key_symbols.len(),
64 self.file_index.len(),
65 self.token_count
66 )
67 }
68}
69
70#[derive(Debug, Clone, Serialize)]
72pub struct RankedSymbol {
73 pub name: String,
75 pub kind: String,
77 pub file: String,
79 pub line: u32,
81 pub signature: Option<String>,
83 pub summary: Option<String>,
85 pub references: u32,
87 pub rank: u32,
89 pub importance: f32,
91}
92
93#[derive(Debug, Clone, Serialize)]
95pub struct ModuleGraph {
96 pub nodes: Vec<ModuleNode>,
98 pub edges: Vec<ModuleEdge>,
100}
101
102#[derive(Debug, Clone, Serialize)]
104pub struct ModuleNode {
105 pub name: String,
107 pub files: u32,
109 pub tokens: u32,
111}
112
113#[derive(Debug, Clone, Serialize)]
115pub struct ModuleEdge {
116 pub from: String,
118 pub to: String,
120 pub weight: u32,
122}
123
124#[derive(Debug, Clone, Serialize)]
126pub struct FileIndexEntry {
127 pub path: String,
129 pub tokens: u32,
131 pub importance: String,
133 pub summary: Option<String>,
135}
136
137#[derive(Debug, Clone, TypedBuilder)]
154pub struct RepoMapGenerator {
155 #[builder(default, setter(strip_option))]
157 max_symbols: Option<usize>,
158
159 #[builder(default = TokenizerModel::Claude)]
161 model: TokenizerModel,
162
163 #[builder(default = 2000)]
165 token_budget: u32,
166}
167
168impl RepoMapGenerator {
169 pub fn new(token_budget: u32) -> Self {
173 Self { max_symbols: None, model: TokenizerModel::Claude, token_budget }
174 }
175
176 fn effective_max_symbols(&self) -> usize {
180 self.max_symbols.unwrap_or_else(|| {
181 ((self.token_budget as usize) / repomap_consts::BUDGET_SYMBOL_DIVISOR)
184 .clamp(repomap_consts::MIN_SYMBOLS, repomap_consts::MAX_SYMBOLS)
185 })
186 }
187
188 #[must_use = "repository map should be used or formatted"]
198 pub fn generate(&self, repo: &Repository) -> RepoMap {
199 let mut graph = SymbolGraph::new();
201 for file in &repo.files {
202 graph.add_file(file);
203 }
204
205 let symbol_index = self.build_symbol_index(repo);
207
208 self.extract_references_fast(&mut graph, repo, &symbol_index);
210
211 let ranks = graph.compute_pagerank(0.85, 20); let key_symbols = self.build_ranked_symbols_fast(&graph, &ranks);
216
217 let module_graph = self.build_module_graph(repo);
219
220 let file_index = self.build_file_index(repo, key_symbols.len());
222
223 let summary = self.generate_summary(repo, &key_symbols);
225
226 let token_count = self.estimate_tokens(&key_symbols, &file_index);
228
229 RepoMap { summary, key_symbols, module_graph, file_index, token_count }
230 }
231
232 fn build_symbol_index(&self, repo: &Repository) -> HashMap<String, Vec<String>> {
237 let mut index: HashMap<String, Vec<String>> = HashMap::new();
238 for file in &repo.files {
239 let path_key = Self::strip_language_extension(&file.relative_path);
241
242 for symbol in &file.symbols {
243 let symbol_key = format!("{}:{}", file.relative_path, symbol.name);
244
245 index
247 .entry(symbol.name.clone())
248 .or_default()
249 .push(symbol_key.clone());
250
251 index
254 .entry(path_key.to_owned())
255 .or_default()
256 .push(symbol_key);
257 }
258 }
259 index
260 }
261
262 fn extract_references_fast(
267 &self,
268 graph: &mut SymbolGraph,
269 repo: &Repository,
270 index: &HashMap<String, Vec<String>>,
271 ) {
272 for file in &repo.files {
273 for symbol in &file.symbols {
274 let from_key = format!("{}:{}", file.relative_path, symbol.name);
275
276 if symbol.kind == SymbolKind::Import {
278 if let Some(targets) = index.get(&symbol.name) {
280 for target in targets {
281 if target != &from_key {
283 graph.add_reference(&from_key, target, graph::EdgeType::Imports);
284 }
285 }
286 }
287 }
288
289 if let Some(ref base_class) = symbol.extends {
291 if let Some(targets) = index.get(base_class) {
293 for target in targets {
294 if target != &from_key {
295 graph.add_reference(&from_key, target, graph::EdgeType::Inherits);
296 }
297 }
298 } else {
299 graph.add_reference_by_name(
300 &from_key,
301 base_class,
302 graph::EdgeType::Inherits,
303 );
304 }
305 }
306
307 for iface in &symbol.implements {
309 if let Some(targets) = index.get(iface) {
310 for target in targets {
311 if target != &from_key {
312 graph.add_reference(&from_key, target, graph::EdgeType::Implements);
313 }
314 }
315 } else {
316 graph.add_reference_by_name(&from_key, iface, graph::EdgeType::Implements);
317 }
318 }
319
320 for called_name in &symbol.calls {
322 if let Some(targets) = index.get(called_name) {
323 for target in targets {
324 if target != &from_key {
325 graph.add_reference(&from_key, target, graph::EdgeType::Calls);
326 }
327 }
328 } else {
329 graph.add_reference_by_name(&from_key, called_name, graph::EdgeType::Calls);
330 }
331 }
332 }
333 }
334 }
335
336 fn build_ranked_symbols_fast(
338 &self,
339 graph: &SymbolGraph,
340 ranks: &HashMap<String, f64>,
341 ) -> Vec<RankedSymbol> {
342 let top_nodes = graph.get_top_symbols_with_ranks(ranks, self.effective_max_symbols());
343
344 top_nodes
345 .iter()
346 .enumerate()
347 .map(|(i, node)| {
348 let key = format!("{}:{}", node.file_path, node.symbol.name);
349 let rank_score = ranks.get(&key).copied().unwrap_or(0.0);
350
351 RankedSymbol {
352 name: node.symbol.name.clone(),
353 kind: node.symbol.kind.name().to_owned(),
354 file: node.file_path.clone(),
355 line: node.symbol.start_line,
356 signature: node.symbol.signature.clone(),
357 summary: summarize_symbol(&node.symbol),
358 references: node.symbol.references,
359 rank: (i + 1) as u32,
360 importance: rank_score as f32,
361 }
362 })
363 .collect()
364 }
365
366 fn build_module_graph(&self, repo: &Repository) -> ModuleGraph {
367 let mut modules: HashMap<String, ModuleNode> = HashMap::new();
368 let mut edge_counts: HashMap<(String, String), u32> = HashMap::new();
369
370 for file in &repo.files {
372 let module = file
373 .relative_path
374 .split('/')
375 .next()
376 .unwrap_or("root")
377 .to_owned();
378
379 let entry = modules.entry(module.clone()).or_insert(ModuleNode {
380 name: module.clone(),
381 files: 0,
382 tokens: 0,
383 });
384
385 entry.files += 1;
386 entry.tokens += file.token_count.get(self.model);
387 }
388
389 let mut symbol_to_module: HashMap<String, String> = HashMap::new();
391 for file in &repo.files {
392 let module = file
393 .relative_path
394 .split('/')
395 .next()
396 .unwrap_or("root")
397 .to_owned();
398
399 for symbol in &file.symbols {
400 symbol_to_module.insert(symbol.name.clone(), module.clone());
401 }
402 }
403
404 for file in &repo.files {
406 let from_module = file
407 .relative_path
408 .split('/')
409 .next()
410 .unwrap_or("root")
411 .to_owned();
412
413 for symbol in &file.symbols {
414 if symbol.kind == SymbolKind::Import {
416 if let Some(target_module) = symbol_to_module.get(&symbol.name) {
417 if target_module != &from_module {
418 *edge_counts
419 .entry((from_module.clone(), target_module.clone()))
420 .or_insert(0) += 1;
421 }
422 }
423 }
424
425 for called_name in &symbol.calls {
427 if let Some(target_module) = symbol_to_module.get(called_name) {
428 if target_module != &from_module {
429 *edge_counts
430 .entry((from_module.clone(), target_module.clone()))
431 .or_insert(0) += 1;
432 }
433 }
434 }
435 }
436 }
437
438 let edges: Vec<ModuleEdge> = edge_counts
440 .into_iter()
441 .map(|((from, to), weight)| ModuleEdge { from, to, weight })
442 .collect();
443
444 ModuleGraph { nodes: modules.into_values().collect(), edges }
445 }
446
447 fn build_file_index(&self, repo: &Repository, symbol_count: usize) -> Vec<FileIndexEntry> {
448 let mut files: Vec<_> = repo
449 .files
450 .iter()
451 .map(|f| {
452 let importance = if f.importance > 0.8 {
453 "critical"
454 } else if f.importance > 0.6 {
455 "high"
456 } else if f.importance > 0.3 {
457 "normal"
458 } else {
459 "low"
460 };
461
462 FileIndexEntry {
463 path: f.relative_path.clone(),
464 tokens: f.token_count.get(self.model),
465 importance: importance.to_owned(),
466 summary: None,
467 }
468 })
469 .collect();
470
471 files.sort_by(|a, b| {
473 let a_imp = match a.importance.as_str() {
474 "critical" => 4,
475 "high" => 3,
476 "normal" => 2,
477 _ => 1,
478 };
479 let b_imp = match b.importance.as_str() {
480 "critical" => 4,
481 "high" => 3,
482 "normal" => 2,
483 _ => 1,
484 };
485 b_imp.cmp(&a_imp)
486 });
487
488 let symbol_tokens = symbol_count as u32 * repomap_consts::TOKENS_PER_SYMBOL;
491 let remaining_budget = self
492 .token_budget
493 .saturating_sub(symbol_tokens)
494 .saturating_sub(repomap_consts::TOKEN_OVERHEAD);
495 let max_files = (remaining_budget / repomap_consts::TOKENS_PER_FILE) as usize;
496
497 if files.len() > max_files && max_files > 0 {
498 files.truncate(max_files);
499 }
500
501 files
502 }
503
504 fn generate_summary(&self, repo: &Repository, symbols: &[RankedSymbol]) -> String {
505 let mut seen = std::collections::HashSet::new();
507 let top_modules: Vec<_> = symbols
508 .iter()
509 .filter_map(|s| s.file.split('/').next())
510 .filter(|m| seen.insert(*m))
511 .take(3)
512 .collect();
513
514 let primary_lang = repo
515 .metadata
516 .languages
517 .first()
518 .map_or("unknown", |l| l.language.as_str());
519
520 format!(
521 "Repository: {} ({} files, {} lines)\n\
522 Primary language: {}\n\
523 Key modules: {}",
524 repo.name,
525 repo.metadata.total_files,
526 repo.metadata.total_lines,
527 primary_lang,
528 top_modules.join(", ")
529 )
530 }
531
532 fn estimate_tokens(&self, symbols: &[RankedSymbol], files: &[FileIndexEntry]) -> u32 {
533 let symbol_tokens = symbols.len() as u32 * repomap_consts::TOKENS_PER_SYMBOL;
535 let file_tokens = files.len() as u32 * repomap_consts::TOKENS_PER_FILE;
536
537 symbol_tokens + file_tokens + repomap_consts::TOKEN_OVERHEAD
538 }
539
540 fn strip_language_extension(path: &str) -> &str {
543 const EXTENSIONS: &[&str] = &[
545 ".gemspec",
547 ".fsscript",
548 ".tsx",
550 ".jsx",
551 ".mjs",
552 ".cjs",
553 ".ts",
554 ".js",
555 ".cpp",
557 ".cxx",
558 ".hpp",
559 ".hxx",
560 ".cc",
561 ".hh",
562 ".c",
563 ".h",
564 ".cs",
566 ".rb",
568 ".rake",
569 ".bash",
571 ".zsh",
572 ".fish",
573 ".sh",
574 ".phtml",
576 ".php3",
577 ".php4",
578 ".php5",
579 ".phps",
580 ".php",
581 ".kts",
583 ".kt",
584 ".swift",
586 ".scala",
588 ".sc",
589 ".lhs",
591 ".hs",
592 ".heex",
594 ".leex",
595 ".exs",
596 ".eex",
597 ".ex",
598 ".cljs",
600 ".cljc",
601 ".edn",
602 ".clj",
603 ".mli",
605 ".ml",
606 ".fsx",
608 ".fsi",
609 ".fs",
610 ".lua",
612 ".rmd",
614 ".r",
615 ".pyw",
617 ".py",
618 ".rs",
620 ".go",
622 ".java",
624 ];
625
626 for ext in EXTENSIONS {
627 if let Some(stripped) = path.strip_suffix(ext) {
628 return stripped;
629 }
630 }
631 path
632 }
633}
634
635#[cfg(test)]
636#[allow(clippy::str_to_string)]
637mod tests {
638 use super::*;
639 use crate::types::{RepoMetadata, TokenCounts};
640
641 fn create_test_repo() -> Repository {
642 Repository {
643 name: "test-repo".to_owned(),
644 path: "/tmp/test".into(),
645 files: vec![RepoFile {
646 path: "/tmp/test/src/main.py".into(),
647 relative_path: "src/main.py".to_string(),
648 language: Some("python".to_string()),
649 size_bytes: 1000,
650 token_count: TokenCounts {
651 o200k: 240,
652 cl100k: 245,
653 claude: 250,
654 gemini: 230,
655 llama: 235,
656 mistral: 235,
657 deepseek: 235,
658 qwen: 235,
659 cohere: 238,
660 grok: 235,
661 },
662 symbols: vec![Symbol {
663 name: "main".to_string(),
664 kind: SymbolKind::Function,
665 signature: Some("def main() -> None".to_string()),
666 docstring: Some("Entry point".to_string()),
667 start_line: 10,
668 end_line: 25,
669 references: 5,
670 importance: 0.9,
671 parent: None,
672 visibility: crate::types::Visibility::Public,
673 calls: vec!["helper".to_string(), "process".to_string()],
674 extends: None,
675 implements: vec![],
676 }],
677 importance: 0.9,
678 content: None,
679 }],
680 metadata: RepoMetadata {
681 total_files: 1,
682 total_lines: 100,
683 total_tokens: TokenCounts {
684 o200k: 240,
685 cl100k: 245,
686 claude: 250,
687 gemini: 230,
688 llama: 235,
689 mistral: 235,
690 deepseek: 235,
691 qwen: 235,
692 cohere: 238,
693 grok: 235,
694 },
695 languages: vec![crate::types::LanguageStats {
696 language: "Python".to_string(),
697 files: 1,
698 lines: 100,
699 percentage: 100.0,
700 }],
701 framework: None,
702 description: None,
703 branch: None,
704 commit: None,
705 directory_structure: None,
706 external_dependencies: vec![],
707 git_history: None,
708 },
709 }
710 }
711
712 #[test]
713 fn test_generate_repomap() {
714 let repo = create_test_repo();
715 let generator = RepoMapGenerator::new(2000);
716 let map = generator.generate(&repo);
717
718 assert!(!map.summary.is_empty());
719 assert!(!map.file_index.is_empty());
720 }
721
722 #[test]
724 fn test_map_budget_affects_output() {
725 let small = RepoMapGenerator::new(500);
727 let medium = RepoMapGenerator::new(2000);
728 let large = RepoMapGenerator::new(10000);
729
730 let small_max = small.effective_max_symbols();
731 let medium_max = medium.effective_max_symbols();
732 let large_max = large.effective_max_symbols();
733
734 assert!(
736 small_max < medium_max,
737 "Small budget ({}) should have fewer symbols ({}) than medium ({}) with ({})",
738 500,
739 small_max,
740 2000,
741 medium_max
742 );
743
744 assert!(
746 medium_max < large_max,
747 "Medium budget ({}) should have fewer symbols ({}) than large ({}) with ({})",
748 2000,
749 medium_max,
750 10000,
751 large_max
752 );
753
754 assert!(
760 (20..=30).contains(&small_max),
761 "Small max_symbols should be ~25, got {}",
762 small_max
763 );
764 assert!(
765 (90..=110).contains(&medium_max),
766 "Medium max_symbols should be ~100, got {}",
767 medium_max
768 );
769 assert!(
770 (400..=500).contains(&large_max),
771 "Large max_symbols should be ~500, got {}",
772 large_max
773 );
774 }
775
776 #[test]
777 fn test_map_budget_min_max_clamped() {
778 use crate::constants::repomap as consts;
779
780 let tiny = RepoMapGenerator::new(10);
782 assert_eq!(tiny.effective_max_symbols(), consts::MIN_SYMBOLS);
783
784 let huge = RepoMapGenerator::new(100_000);
786 assert_eq!(huge.effective_max_symbols(), consts::MAX_SYMBOLS);
787 }
788}