infiniloom_engine/repomap/
mod.rs

1//! Repository map generation with PageRank-based symbol ranking
2
3mod graph;
4
5// ============================================================================
6// Token estimation constants for budget calculations
7// ============================================================================
8// These values are derived from empirical analysis of typical repository maps:
9// - Symbol entries include name, kind, file, line, signature (~25 tokens avg)
10// - File entries include path, tokens, importance (~10 tokens avg)
11// - Overhead covers headers, summary text, formatting (~100 tokens)
12
13/// Estimated tokens per symbol entry in the repository map
14const TOKENS_PER_SYMBOL: u32 = 25;
15
16/// Estimated tokens per file entry in the file index
17const TOKENS_PER_FILE: u32 = 10;
18
19/// Fixed token overhead for headers, summary, and formatting
20const TOKEN_OVERHEAD: u32 = 100;
21
22/// Divisor for computing max symbols from budget (includes safety margin)
23/// Formula: max_symbols = budget / BUDGET_SYMBOL_DIVISOR
24/// Bug #7 fix: Reduced from 30 to 20 for more pronounced effect
25const BUDGET_SYMBOL_DIVISOR: usize = 20;
26const SUMMARY_MAX_LEN: usize = 120;
27
28fn summarize_symbol(symbol: &Symbol) -> Option<String> {
29    let line = symbol
30        .docstring
31        .as_deref()
32        .and_then(first_nonempty_line)
33        .or_else(|| symbol.signature.as_deref().and_then(first_nonempty_line));
34
35    line.map(|text| truncate_summary(text, SUMMARY_MAX_LEN))
36}
37
38fn first_nonempty_line(text: &str) -> Option<&str> {
39    text.lines()
40        .map(|line| line.trim())
41        .find(|line| !line.is_empty())
42}
43
44fn truncate_summary(text: &str, max_len: usize) -> String {
45    if text.chars().count() <= max_len {
46        return text.to_owned();
47    }
48
49    let mut truncated: String = text.chars().take(max_len).collect();
50    truncated = truncated.trim_end().to_owned();
51    truncated.push_str("...");
52    truncated
53}
54
55#[cfg(test)]
56use crate::types::RepoFile;
57use crate::types::{Repository, Symbol, SymbolKind, TokenizerModel};
58use graph::SymbolGraph;
59use serde::Serialize;
60use std::collections::HashMap;
61use std::fmt;
62use typed_builder::TypedBuilder;
63
64/// A repository map - a concise summary of the codebase
65#[derive(Debug, Clone, Serialize)]
66pub struct RepoMap {
67    /// Text summary of the repository
68    pub summary: String,
69    /// Most important symbols ranked by PageRank
70    pub key_symbols: Vec<RankedSymbol>,
71    /// Module/directory dependency graph
72    pub module_graph: ModuleGraph,
73    /// Index of all files with metadata
74    pub file_index: Vec<FileIndexEntry>,
75    /// Total token count for this map
76    pub token_count: u32,
77}
78
79impl fmt::Display for RepoMap {
80    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
81        write!(
82            f,
83            "RepoMap({} symbols, {} files, {} tokens)",
84            self.key_symbols.len(),
85            self.file_index.len(),
86            self.token_count
87        )
88    }
89}
90
91/// A symbol with its computed rank
92#[derive(Debug, Clone, Serialize)]
93pub struct RankedSymbol {
94    /// Symbol name
95    pub name: String,
96    /// Symbol kind
97    pub kind: String,
98    /// File containing the symbol
99    pub file: String,
100    /// Line number
101    pub line: u32,
102    /// Function/method signature
103    pub signature: Option<String>,
104    /// Short summary (docstring or signature)
105    pub summary: Option<String>,
106    /// Number of references
107    pub references: u32,
108    /// Rank (1 = most important)
109    pub rank: u32,
110    /// Importance score (0.0 - 1.0)
111    pub importance: f32,
112}
113
114/// Graph of module dependencies
115#[derive(Debug, Clone, Serialize)]
116pub struct ModuleGraph {
117    /// Module nodes
118    pub nodes: Vec<ModuleNode>,
119    /// Dependency edges
120    pub edges: Vec<ModuleEdge>,
121}
122
123/// A module/directory node
124#[derive(Debug, Clone, Serialize)]
125pub struct ModuleNode {
126    /// Module name (usually directory name)
127    pub name: String,
128    /// Number of files in module
129    pub files: u32,
130    /// Total tokens in module
131    pub tokens: u32,
132}
133
134/// A dependency edge between modules
135#[derive(Debug, Clone, Serialize)]
136pub struct ModuleEdge {
137    /// Source module
138    pub from: String,
139    /// Target module
140    pub to: String,
141    /// Number of imports/references
142    pub weight: u32,
143}
144
145/// File index entry
146#[derive(Debug, Clone, Serialize)]
147pub struct FileIndexEntry {
148    /// Relative file path
149    pub path: String,
150    /// Token count
151    pub tokens: u32,
152    /// Importance level (critical/high/normal/low)
153    pub importance: String,
154    /// Brief summary (optional)
155    pub summary: Option<String>,
156}
157
158/// Generator for repository maps
159///
160/// # Examples
161///
162/// ```rust,ignore
163/// use infiniloom_engine::{RepoMapGenerator, TokenizerModel};
164///
165/// // Use builder pattern
166/// let generator = RepoMapGenerator::builder()
167///     .token_budget(50_000)
168///     .model(TokenizerModel::Gpt4o)
169///     .build();
170///
171/// // Or use the convenience constructor
172/// let generator = RepoMapGenerator::new(2000);
173/// ```
174#[derive(Debug, Clone, TypedBuilder)]
175pub struct RepoMapGenerator {
176    /// Maximum number of symbols to include (default: computed from budget)
177    #[builder(default, setter(strip_option))]
178    max_symbols: Option<usize>,
179
180    /// Target model for token counting (default: Claude)
181    #[builder(default = TokenizerModel::Claude)]
182    model: TokenizerModel,
183
184    /// Token budget for the map (default: 2000)
185    #[builder(default = 2000)]
186    token_budget: u32,
187}
188
189impl RepoMapGenerator {
190    /// Create a new generator with token budget
191    /// The token budget influences how many symbols are included in the map.
192    /// Approximately 25 tokens per symbol entry, so max_symbols = budget / 30 (with overhead)
193    pub fn new(token_budget: u32) -> Self {
194        Self { max_symbols: None, model: TokenizerModel::Claude, token_budget }
195    }
196
197    /// Get effective max symbols (computed from budget if not set)
198    ///
199    /// Bug #7 fix: Increased max cap from 200 to 500 and made formula more responsive
200    fn effective_max_symbols(&self) -> usize {
201        self.max_symbols.unwrap_or_else(|| {
202            // Use constants for token estimation (see module-level documentation)
203            // Bug #7 fix: Increased max from 200 to 500 for larger budgets
204            ((self.token_budget as usize) / BUDGET_SYMBOL_DIVISOR).clamp(5, 500)
205        })
206    }
207
208    /// Generate a repository map with PageRank-based symbol ranking.
209    ///
210    /// The generated map includes:
211    /// - Key symbols ranked by importance (PageRank algorithm)
212    /// - Module dependency graph
213    /// - File index with importance levels
214    /// - Summary statistics
215    ///
216    /// The map is optimized for the configured token budget.
217    #[must_use = "repository map should be used or formatted"]
218    pub fn generate(&self, repo: &Repository) -> RepoMap {
219        // Build symbol graph
220        let mut graph = SymbolGraph::new();
221        for file in &repo.files {
222            graph.add_file(file);
223        }
224
225        // Build lookup index for fast import resolution
226        let symbol_index = self.build_symbol_index(repo);
227
228        // Extract references from symbols using index
229        self.extract_references_fast(&mut graph, repo, &symbol_index);
230
231        // Compute PageRank once
232        let ranks = graph.compute_pagerank(0.85, 20); // Reduced iterations for speed
233
234        // Get top symbols using pre-computed ranks
235        let key_symbols = self.build_ranked_symbols_fast(&graph, &ranks);
236
237        // Build module graph
238        let module_graph = self.build_module_graph(repo);
239
240        // Build file index (with token budget enforcement based on symbol count)
241        let file_index = self.build_file_index(repo, key_symbols.len());
242
243        // Generate summary
244        let summary = self.generate_summary(repo, &key_symbols);
245
246        // Estimate token count
247        let token_count = self.estimate_tokens(&key_symbols, &file_index);
248
249        RepoMap { summary, key_symbols, module_graph, file_index, token_count }
250    }
251
252    /// Build an index of symbols for fast lookup
253    ///
254    /// Uses multi-value index to handle symbol name collisions across files.
255    /// For example, multiple files may have a `main` function - all are indexed.
256    fn build_symbol_index(&self, repo: &Repository) -> HashMap<String, Vec<String>> {
257        let mut index: HashMap<String, Vec<String>> = HashMap::new();
258        for file in &repo.files {
259            // Index by file path (without extension) - supports all 22 languages
260            let path_key = Self::strip_language_extension(&file.relative_path);
261
262            for symbol in &file.symbols {
263                let symbol_key = format!("{}:{}", file.relative_path, symbol.name);
264
265                // Index by symbol name (multi-value to handle collisions)
266                index
267                    .entry(symbol.name.clone())
268                    .or_default()
269                    .push(symbol_key.clone());
270
271                // Index by path component (only for the first symbol to maintain backwards compat)
272                // This allows import resolution like "from utils import foo" -> "utils.py:foo"
273                index
274                    .entry(path_key.to_owned())
275                    .or_default()
276                    .push(symbol_key);
277            }
278        }
279        index
280    }
281
282    /// Fast reference extraction using pre-built index
283    ///
284    /// Uses multi-value index to properly resolve all matching symbols when
285    /// there are name collisions across files.
286    fn extract_references_fast(
287        &self,
288        graph: &mut SymbolGraph,
289        repo: &Repository,
290        index: &HashMap<String, Vec<String>>,
291    ) {
292        for file in &repo.files {
293            for symbol in &file.symbols {
294                let from_key = format!("{}:{}", file.relative_path, symbol.name);
295
296                // Extract import references
297                if symbol.kind == SymbolKind::Import {
298                    // Fast lookup using index - connect to all matching symbols
299                    if let Some(targets) = index.get(&symbol.name) {
300                        for target in targets {
301                            // Don't create self-references
302                            if target != &from_key {
303                                graph.add_reference(&from_key, target, graph::EdgeType::Imports);
304                            }
305                        }
306                    }
307                }
308
309                // Extract inheritance relationships (extends)
310                if let Some(ref base_class) = symbol.extends {
311                    // Try fast lookup first, then fallback to name-based search
312                    if let Some(targets) = index.get(base_class) {
313                        for target in targets {
314                            if target != &from_key {
315                                graph.add_reference(&from_key, target, graph::EdgeType::Inherits);
316                            }
317                        }
318                    } else {
319                        graph.add_reference_by_name(
320                            &from_key,
321                            base_class,
322                            graph::EdgeType::Inherits,
323                        );
324                    }
325                }
326
327                // Extract interface/trait implementations
328                for iface in &symbol.implements {
329                    if let Some(targets) = index.get(iface) {
330                        for target in targets {
331                            if target != &from_key {
332                                graph.add_reference(&from_key, target, graph::EdgeType::Implements);
333                            }
334                        }
335                    } else {
336                        graph.add_reference_by_name(&from_key, iface, graph::EdgeType::Implements);
337                    }
338                }
339
340                // Extract function call relationships
341                for called_name in &symbol.calls {
342                    if let Some(targets) = index.get(called_name) {
343                        for target in targets {
344                            if target != &from_key {
345                                graph.add_reference(&from_key, target, graph::EdgeType::Calls);
346                            }
347                        }
348                    } else {
349                        graph.add_reference_by_name(&from_key, called_name, graph::EdgeType::Calls);
350                    }
351                }
352            }
353        }
354    }
355
356    /// Build ranked symbols using pre-computed ranks
357    fn build_ranked_symbols_fast(
358        &self,
359        graph: &SymbolGraph,
360        ranks: &HashMap<String, f64>,
361    ) -> Vec<RankedSymbol> {
362        let top_nodes = graph.get_top_symbols_with_ranks(ranks, self.effective_max_symbols());
363
364        top_nodes
365            .iter()
366            .enumerate()
367            .map(|(i, node)| {
368                let key = format!("{}:{}", node.file_path, node.symbol.name);
369                let rank_score = ranks.get(&key).copied().unwrap_or(0.0);
370
371                RankedSymbol {
372                    name: node.symbol.name.clone(),
373                    kind: node.symbol.kind.name().to_owned(),
374                    file: node.file_path.clone(),
375                    line: node.symbol.start_line,
376                    signature: node.symbol.signature.clone(),
377                    summary: summarize_symbol(&node.symbol),
378                    references: node.symbol.references,
379                    rank: (i + 1) as u32,
380                    importance: rank_score as f32,
381                }
382            })
383            .collect()
384    }
385
386    fn build_module_graph(&self, repo: &Repository) -> ModuleGraph {
387        let mut modules: HashMap<String, ModuleNode> = HashMap::new();
388        let mut edge_counts: HashMap<(String, String), u32> = HashMap::new();
389
390        // Build file index by module (first pass)
391        for file in &repo.files {
392            let module = file
393                .relative_path
394                .split('/')
395                .next()
396                .unwrap_or("root")
397                .to_owned();
398
399            let entry = modules.entry(module.clone()).or_insert(ModuleNode {
400                name: module.clone(),
401                files: 0,
402                tokens: 0,
403            });
404
405            entry.files += 1;
406            entry.tokens += file.token_count.get(self.model);
407        }
408
409        // Build a map of symbol names to their modules for cross-reference
410        let mut symbol_to_module: HashMap<String, String> = HashMap::new();
411        for file in &repo.files {
412            let module = file
413                .relative_path
414                .split('/')
415                .next()
416                .unwrap_or("root")
417                .to_owned();
418
419            for symbol in &file.symbols {
420                symbol_to_module.insert(symbol.name.clone(), module.clone());
421            }
422        }
423
424        // Compute edges based on imports and calls between modules
425        for file in &repo.files {
426            let from_module = file
427                .relative_path
428                .split('/')
429                .next()
430                .unwrap_or("root")
431                .to_owned();
432
433            for symbol in &file.symbols {
434                // Count import references to other modules
435                if symbol.kind == SymbolKind::Import {
436                    if let Some(target_module) = symbol_to_module.get(&symbol.name) {
437                        if target_module != &from_module {
438                            *edge_counts
439                                .entry((from_module.clone(), target_module.clone()))
440                                .or_insert(0) += 1;
441                        }
442                    }
443                }
444
445                // Count function calls to other modules
446                for called_name in &symbol.calls {
447                    if let Some(target_module) = symbol_to_module.get(called_name) {
448                        if target_module != &from_module {
449                            *edge_counts
450                                .entry((from_module.clone(), target_module.clone()))
451                                .or_insert(0) += 1;
452                        }
453                    }
454                }
455            }
456        }
457
458        // Convert edge counts to edges
459        let edges: Vec<ModuleEdge> = edge_counts
460            .into_iter()
461            .map(|((from, to), weight)| ModuleEdge { from, to, weight })
462            .collect();
463
464        ModuleGraph { nodes: modules.into_values().collect(), edges }
465    }
466
467    fn build_file_index(&self, repo: &Repository, symbol_count: usize) -> Vec<FileIndexEntry> {
468        let mut files: Vec<_> = repo
469            .files
470            .iter()
471            .map(|f| {
472                let importance = if f.importance > 0.8 {
473                    "critical"
474                } else if f.importance > 0.6 {
475                    "high"
476                } else if f.importance > 0.3 {
477                    "normal"
478                } else {
479                    "low"
480                };
481
482                FileIndexEntry {
483                    path: f.relative_path.clone(),
484                    tokens: f.token_count.get(self.model),
485                    importance: importance.to_owned(),
486                    summary: None,
487                }
488            })
489            .collect();
490
491        // Sort by importance
492        files.sort_by(|a, b| {
493            let a_imp = match a.importance.as_str() {
494                "critical" => 4,
495                "high" => 3,
496                "normal" => 2,
497                _ => 1,
498            };
499            let b_imp = match b.importance.as_str() {
500                "critical" => 4,
501                "high" => 3,
502                "normal" => 2,
503                _ => 1,
504            };
505            b_imp.cmp(&a_imp)
506        });
507
508        // Enforce token budget: limit file entries based on remaining budget
509        // Uses constants defined at module level for token estimation
510        let symbol_tokens = symbol_count as u32 * TOKENS_PER_SYMBOL;
511        let remaining_budget = self
512            .token_budget
513            .saturating_sub(symbol_tokens)
514            .saturating_sub(TOKEN_OVERHEAD);
515        let max_files = (remaining_budget / TOKENS_PER_FILE) as usize;
516
517        if files.len() > max_files && max_files > 0 {
518            files.truncate(max_files);
519        }
520
521        files
522    }
523
524    fn generate_summary(&self, repo: &Repository, symbols: &[RankedSymbol]) -> String {
525        // Deduplicate modules - take top symbols until we have 3 unique modules
526        let mut seen = std::collections::HashSet::new();
527        let top_modules: Vec<_> = symbols
528            .iter()
529            .filter_map(|s| s.file.split('/').next())
530            .filter(|m| seen.insert(*m))
531            .take(3)
532            .collect();
533
534        let primary_lang = repo
535            .metadata
536            .languages
537            .first()
538            .map(|l| l.language.as_str())
539            .unwrap_or("unknown");
540
541        format!(
542            "Repository: {} ({} files, {} lines)\n\
543             Primary language: {}\n\
544             Key modules: {}",
545            repo.name,
546            repo.metadata.total_files,
547            repo.metadata.total_lines,
548            primary_lang,
549            top_modules.join(", ")
550        )
551    }
552
553    fn estimate_tokens(&self, symbols: &[RankedSymbol], files: &[FileIndexEntry]) -> u32 {
554        // Uses constants defined at module level for token estimation
555        let symbol_tokens = symbols.len() as u32 * TOKENS_PER_SYMBOL;
556        let file_tokens = files.len() as u32 * TOKENS_PER_FILE;
557
558        symbol_tokens + file_tokens + TOKEN_OVERHEAD
559    }
560
561    /// Strip language extension from a file path for indexing
562    /// Supports all 22 languages from the parser module
563    fn strip_language_extension(path: &str) -> &str {
564        // Extensions ordered by specificity (longer first to avoid partial matches)
565        const EXTENSIONS: &[&str] = &[
566            // Multi-char extensions first
567            ".gemspec",
568            ".fsscript",
569            // TypeScript/JavaScript
570            ".tsx",
571            ".jsx",
572            ".mjs",
573            ".cjs",
574            ".ts",
575            ".js",
576            // C/C++
577            ".cpp",
578            ".cxx",
579            ".hpp",
580            ".hxx",
581            ".cc",
582            ".hh",
583            ".c",
584            ".h",
585            // C#
586            ".cs",
587            // Ruby
588            ".rb",
589            ".rake",
590            // Shell
591            ".bash",
592            ".zsh",
593            ".fish",
594            ".sh",
595            // PHP
596            ".phtml",
597            ".php3",
598            ".php4",
599            ".php5",
600            ".phps",
601            ".php",
602            // Kotlin
603            ".kts",
604            ".kt",
605            // Swift
606            ".swift",
607            // Scala
608            ".scala",
609            ".sc",
610            // Haskell
611            ".lhs",
612            ".hs",
613            // Elixir
614            ".heex",
615            ".leex",
616            ".exs",
617            ".eex",
618            ".ex",
619            // Clojure
620            ".cljs",
621            ".cljc",
622            ".edn",
623            ".clj",
624            // OCaml
625            ".mli",
626            ".ml",
627            // F#
628            ".fsx",
629            ".fsi",
630            ".fs",
631            // Lua
632            ".lua",
633            // R
634            ".rmd",
635            ".r",
636            // Python
637            ".pyw",
638            ".py",
639            // Rust
640            ".rs",
641            // Go
642            ".go",
643            // Java
644            ".java",
645        ];
646
647        for ext in EXTENSIONS {
648            if let Some(stripped) = path.strip_suffix(ext) {
649                return stripped;
650            }
651        }
652        path
653    }
654}
655
656#[cfg(test)]
657#[allow(clippy::str_to_string)]
658mod tests {
659    use super::*;
660    use crate::types::{RepoMetadata, TokenCounts};
661
662    fn create_test_repo() -> Repository {
663        Repository {
664            name: "test-repo".to_owned(),
665            path: "/tmp/test".into(),
666            files: vec![RepoFile {
667                path: "/tmp/test/src/main.py".into(),
668                relative_path: "src/main.py".to_string(),
669                language: Some("python".to_string()),
670                size_bytes: 1000,
671                token_count: TokenCounts {
672                    o200k: 240,
673                    cl100k: 245,
674                    claude: 250,
675                    gemini: 230,
676                    llama: 235,
677                    mistral: 235,
678                    deepseek: 235,
679                    qwen: 235,
680                    cohere: 238,
681                    grok: 235,
682                },
683                symbols: vec![Symbol {
684                    name: "main".to_string(),
685                    kind: SymbolKind::Function,
686                    signature: Some("def main() -> None".to_string()),
687                    docstring: Some("Entry point".to_string()),
688                    start_line: 10,
689                    end_line: 25,
690                    references: 5,
691                    importance: 0.9,
692                    parent: None,
693                    visibility: crate::types::Visibility::Public,
694                    calls: vec!["helper".to_string(), "process".to_string()],
695                    extends: None,
696                    implements: vec![],
697                }],
698                importance: 0.9,
699                content: None,
700            }],
701            metadata: RepoMetadata {
702                total_files: 1,
703                total_lines: 100,
704                total_tokens: TokenCounts {
705                    o200k: 240,
706                    cl100k: 245,
707                    claude: 250,
708                    gemini: 230,
709                    llama: 235,
710                    mistral: 235,
711                    deepseek: 235,
712                    qwen: 235,
713                    cohere: 238,
714                    grok: 235,
715                },
716                languages: vec![crate::types::LanguageStats {
717                    language: "Python".to_string(),
718                    files: 1,
719                    lines: 100,
720                    percentage: 100.0,
721                }],
722                framework: None,
723                description: None,
724                branch: None,
725                commit: None,
726                directory_structure: None,
727                external_dependencies: vec![],
728                git_history: None,
729            },
730        }
731    }
732
733    #[test]
734    fn test_generate_repomap() {
735        let repo = create_test_repo();
736        let generator = RepoMapGenerator::new(2000);
737        let map = generator.generate(&repo);
738
739        assert!(!map.summary.is_empty());
740        assert!(!map.file_index.is_empty());
741    }
742
743    // Bug #7 test - verify mapBudget has significant effect
744    #[test]
745    fn test_map_budget_affects_output() {
746        // Test that different budgets produce different effective_max_symbols
747        let small = RepoMapGenerator::new(500);
748        let medium = RepoMapGenerator::new(2000);
749        let large = RepoMapGenerator::new(10000);
750
751        let small_max = small.effective_max_symbols();
752        let medium_max = medium.effective_max_symbols();
753        let large_max = large.effective_max_symbols();
754
755        // Small budget should have fewer max symbols
756        assert!(
757            small_max < medium_max,
758            "Small budget ({}) should have fewer symbols ({}) than medium ({}) with ({})",
759            500,
760            small_max,
761            2000,
762            medium_max
763        );
764
765        // Large budget should have more max symbols
766        assert!(
767            medium_max < large_max,
768            "Medium budget ({}) should have fewer symbols ({}) than large ({}) with ({})",
769            2000,
770            medium_max,
771            10000,
772            large_max
773        );
774
775        // Verify the actual values are reasonable
776        // With BUDGET_SYMBOL_DIVISOR = 20:
777        // 500 / 20 = 25
778        // 2000 / 20 = 100
779        // 10000 / 20 = 500
780        assert!(small_max >= 20 && small_max <= 30, "Small max_symbols should be ~25, got {}", small_max);
781        assert!(medium_max >= 90 && medium_max <= 110, "Medium max_symbols should be ~100, got {}", medium_max);
782        assert!(large_max >= 400 && large_max <= 500, "Large max_symbols should be ~500, got {}", large_max);
783    }
784
785    #[test]
786    fn test_map_budget_min_max_clamped() {
787        // Very small budget should clamp to minimum
788        let tiny = RepoMapGenerator::new(10);
789        assert_eq!(tiny.effective_max_symbols(), 5);
790
791        // Very large budget should clamp to maximum
792        let huge = RepoMapGenerator::new(100_000);
793        assert_eq!(huge.effective_max_symbols(), 500);
794    }
795}