Skip to main content

infiniloom_engine/repomap/
mod.rs

1//! Repository map generation with PageRank-based symbol ranking
2
3mod graph;
4
5use crate::constants::repomap as repomap_consts;
6
7fn summarize_symbol(symbol: &Symbol) -> Option<String> {
8    let line = symbol
9        .docstring
10        .as_deref()
11        .and_then(first_nonempty_line)
12        .or_else(|| symbol.signature.as_deref().and_then(first_nonempty_line));
13
14    line.map(|text| truncate_summary(text, repomap_consts::SUMMARY_MAX_LEN))
15}
16
17fn first_nonempty_line(text: &str) -> Option<&str> {
18    text.lines()
19        .map(|line| line.trim())
20        .find(|line| !line.is_empty())
21}
22
23fn truncate_summary(text: &str, max_len: usize) -> String {
24    if text.chars().count() <= max_len {
25        return text.to_owned();
26    }
27
28    let mut truncated: String = text.chars().take(max_len).collect();
29    truncated = truncated.trim_end().to_owned();
30    truncated.push_str("...");
31    truncated
32}
33
34#[cfg(test)]
35use crate::types::RepoFile;
36use crate::types::{Repository, Symbol, SymbolKind, TokenizerModel};
37use graph::SymbolGraph;
38use serde::Serialize;
39use std::collections::HashMap;
40use std::fmt;
41use typed_builder::TypedBuilder;
42
43/// A repository map - a concise summary of the codebase
44#[derive(Debug, Clone, Serialize)]
45pub struct RepoMap {
46    /// Text summary of the repository
47    pub summary: String,
48    /// Most important symbols ranked by PageRank
49    pub key_symbols: Vec<RankedSymbol>,
50    /// Module/directory dependency graph
51    pub module_graph: ModuleGraph,
52    /// Index of all files with metadata
53    pub file_index: Vec<FileIndexEntry>,
54    /// Total token count for this map
55    pub token_count: u32,
56}
57
58impl fmt::Display for RepoMap {
59    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
60        write!(
61            f,
62            "RepoMap({} symbols, {} files, {} tokens)",
63            self.key_symbols.len(),
64            self.file_index.len(),
65            self.token_count
66        )
67    }
68}
69
70/// A symbol with its computed rank
71#[derive(Debug, Clone, Serialize)]
72pub struct RankedSymbol {
73    /// Symbol name
74    pub name: String,
75    /// Symbol kind
76    pub kind: String,
77    /// File containing the symbol
78    pub file: String,
79    /// Line number
80    pub line: u32,
81    /// Function/method signature
82    pub signature: Option<String>,
83    /// Short summary (docstring or signature)
84    pub summary: Option<String>,
85    /// Number of references
86    pub references: u32,
87    /// Rank (1 = most important)
88    pub rank: u32,
89    /// Importance score (0.0 - 1.0)
90    pub importance: f32,
91}
92
93/// Graph of module dependencies
94#[derive(Debug, Clone, Serialize)]
95pub struct ModuleGraph {
96    /// Module nodes
97    pub nodes: Vec<ModuleNode>,
98    /// Dependency edges
99    pub edges: Vec<ModuleEdge>,
100}
101
102/// A module/directory node
103#[derive(Debug, Clone, Serialize)]
104pub struct ModuleNode {
105    /// Module name (usually directory name)
106    pub name: String,
107    /// Number of files in module
108    pub files: u32,
109    /// Total tokens in module
110    pub tokens: u32,
111}
112
113/// A dependency edge between modules
114#[derive(Debug, Clone, Serialize)]
115pub struct ModuleEdge {
116    /// Source module
117    pub from: String,
118    /// Target module
119    pub to: String,
120    /// Number of imports/references
121    pub weight: u32,
122}
123
124/// File index entry
125#[derive(Debug, Clone, Serialize)]
126pub struct FileIndexEntry {
127    /// Relative file path
128    pub path: String,
129    /// Token count
130    pub tokens: u32,
131    /// Importance level (critical/high/normal/low)
132    pub importance: String,
133    /// Brief summary (optional)
134    pub summary: Option<String>,
135}
136
137/// Generator for repository maps
138///
139/// # Examples
140///
141/// ```rust,ignore
142/// use infiniloom_engine::{RepoMapGenerator, TokenizerModel};
143///
144/// // Use builder pattern
145/// let generator = RepoMapGenerator::builder()
146///     .token_budget(50_000)
147///     .model(TokenizerModel::Gpt4o)
148///     .build();
149///
150/// // Or use the convenience constructor
151/// let generator = RepoMapGenerator::new(2000);
152/// ```
153#[derive(Debug, Clone, TypedBuilder)]
154pub struct RepoMapGenerator {
155    /// Maximum number of symbols to include (default: computed from budget)
156    #[builder(default, setter(strip_option))]
157    max_symbols: Option<usize>,
158
159    /// Target model for token counting (default: Claude)
160    #[builder(default = TokenizerModel::Claude)]
161    model: TokenizerModel,
162
163    /// Token budget for the map (default: 2000)
164    #[builder(default = 2000)]
165    token_budget: u32,
166}
167
168impl RepoMapGenerator {
169    /// Create a new generator with token budget
170    /// The token budget influences how many symbols are included in the map.
171    /// Approximately 25 tokens per symbol entry, so max_symbols = budget / 30 (with overhead)
172    pub fn new(token_budget: u32) -> Self {
173        Self { max_symbols: None, model: TokenizerModel::Claude, token_budget }
174    }
175
176    /// Get effective max symbols (computed from budget if not set)
177    ///
178    /// Bug #7 fix: Increased max cap from 200 to 500 and made formula more responsive
179    fn effective_max_symbols(&self) -> usize {
180        self.max_symbols.unwrap_or_else(|| {
181            // Use constants for token estimation (see module-level documentation)
182            // Bug #7 fix: Increased max from 200 to 500 for larger budgets
183            ((self.token_budget as usize) / repomap_consts::BUDGET_SYMBOL_DIVISOR)
184                .clamp(repomap_consts::MIN_SYMBOLS, repomap_consts::MAX_SYMBOLS)
185        })
186    }
187
188    /// Generate a repository map with PageRank-based symbol ranking.
189    ///
190    /// The generated map includes:
191    /// - Key symbols ranked by importance (PageRank algorithm)
192    /// - Module dependency graph
193    /// - File index with importance levels
194    /// - Summary statistics
195    ///
196    /// The map is optimized for the configured token budget.
197    #[must_use = "repository map should be used or formatted"]
198    pub fn generate(&self, repo: &Repository) -> RepoMap {
199        // Build symbol graph
200        let mut graph = SymbolGraph::new();
201        for file in &repo.files {
202            graph.add_file(file);
203        }
204
205        // Build lookup index for fast import resolution
206        let symbol_index = self.build_symbol_index(repo);
207
208        // Extract references from symbols using index
209        self.extract_references_fast(&mut graph, repo, &symbol_index);
210
211        // Compute PageRank once
212        let ranks = graph.compute_pagerank(0.85, 20); // Reduced iterations for speed
213
214        // Get top symbols using pre-computed ranks
215        let key_symbols = self.build_ranked_symbols_fast(&graph, &ranks);
216
217        // Build module graph
218        let module_graph = self.build_module_graph(repo);
219
220        // Build file index (with token budget enforcement based on symbol count)
221        let file_index = self.build_file_index(repo, key_symbols.len());
222
223        // Generate summary
224        let summary = self.generate_summary(repo, &key_symbols);
225
226        // Estimate token count
227        let token_count = self.estimate_tokens(&key_symbols, &file_index);
228
229        RepoMap { summary, key_symbols, module_graph, file_index, token_count }
230    }
231
232    /// Build an index of symbols for fast lookup
233    ///
234    /// Uses multi-value index to handle symbol name collisions across files.
235    /// For example, multiple files may have a `main` function - all are indexed.
236    fn build_symbol_index(&self, repo: &Repository) -> HashMap<String, Vec<String>> {
237        let mut index: HashMap<String, Vec<String>> = HashMap::new();
238        for file in &repo.files {
239            // Index by file path (without extension) - supports all 22 languages
240            let path_key = Self::strip_language_extension(&file.relative_path);
241
242            for symbol in &file.symbols {
243                let symbol_key = format!("{}:{}", file.relative_path, symbol.name);
244
245                // Index by symbol name (multi-value to handle collisions)
246                index
247                    .entry(symbol.name.clone())
248                    .or_default()
249                    .push(symbol_key.clone());
250
251                // Index by path component (only for the first symbol to maintain backwards compat)
252                // This allows import resolution like "from utils import foo" -> "utils.py:foo"
253                index
254                    .entry(path_key.to_owned())
255                    .or_default()
256                    .push(symbol_key);
257            }
258        }
259        index
260    }
261
262    /// Fast reference extraction using pre-built index
263    ///
264    /// Uses multi-value index to properly resolve all matching symbols when
265    /// there are name collisions across files.
266    fn extract_references_fast(
267        &self,
268        graph: &mut SymbolGraph,
269        repo: &Repository,
270        index: &HashMap<String, Vec<String>>,
271    ) {
272        for file in &repo.files {
273            for symbol in &file.symbols {
274                let from_key = format!("{}:{}", file.relative_path, symbol.name);
275
276                // Extract import references
277                if symbol.kind == SymbolKind::Import {
278                    // Fast lookup using index - connect to all matching symbols
279                    if let Some(targets) = index.get(&symbol.name) {
280                        for target in targets {
281                            // Don't create self-references
282                            if target != &from_key {
283                                graph.add_reference(&from_key, target, graph::EdgeType::Imports);
284                            }
285                        }
286                    }
287                }
288
289                // Extract inheritance relationships (extends)
290                if let Some(ref base_class) = symbol.extends {
291                    // Try fast lookup first, then fallback to name-based search
292                    if let Some(targets) = index.get(base_class) {
293                        for target in targets {
294                            if target != &from_key {
295                                graph.add_reference(&from_key, target, graph::EdgeType::Inherits);
296                            }
297                        }
298                    } else {
299                        graph.add_reference_by_name(
300                            &from_key,
301                            base_class,
302                            graph::EdgeType::Inherits,
303                        );
304                    }
305                }
306
307                // Extract interface/trait implementations
308                for iface in &symbol.implements {
309                    if let Some(targets) = index.get(iface) {
310                        for target in targets {
311                            if target != &from_key {
312                                graph.add_reference(&from_key, target, graph::EdgeType::Implements);
313                            }
314                        }
315                    } else {
316                        graph.add_reference_by_name(&from_key, iface, graph::EdgeType::Implements);
317                    }
318                }
319
320                // Extract function call relationships
321                for called_name in &symbol.calls {
322                    if let Some(targets) = index.get(called_name) {
323                        for target in targets {
324                            if target != &from_key {
325                                graph.add_reference(&from_key, target, graph::EdgeType::Calls);
326                            }
327                        }
328                    } else {
329                        graph.add_reference_by_name(&from_key, called_name, graph::EdgeType::Calls);
330                    }
331                }
332            }
333        }
334    }
335
336    /// Build ranked symbols using pre-computed ranks
337    fn build_ranked_symbols_fast(
338        &self,
339        graph: &SymbolGraph,
340        ranks: &HashMap<String, f64>,
341    ) -> Vec<RankedSymbol> {
342        let top_nodes = graph.get_top_symbols_with_ranks(ranks, self.effective_max_symbols());
343
344        top_nodes
345            .iter()
346            .enumerate()
347            .map(|(i, node)| {
348                let key = format!("{}:{}", node.file_path, node.symbol.name);
349                let rank_score = ranks.get(&key).copied().unwrap_or(0.0);
350
351                RankedSymbol {
352                    name: node.symbol.name.clone(),
353                    kind: node.symbol.kind.name().to_owned(),
354                    file: node.file_path.clone(),
355                    line: node.symbol.start_line,
356                    signature: node.symbol.signature.clone(),
357                    summary: summarize_symbol(&node.symbol),
358                    references: node.symbol.references,
359                    rank: (i + 1) as u32,
360                    importance: rank_score as f32,
361                }
362            })
363            .collect()
364    }
365
366    fn build_module_graph(&self, repo: &Repository) -> ModuleGraph {
367        let mut modules: HashMap<String, ModuleNode> = HashMap::new();
368        let mut edge_counts: HashMap<(String, String), u32> = HashMap::new();
369
370        // Build file index by module (first pass)
371        for file in &repo.files {
372            let module = file
373                .relative_path
374                .split('/')
375                .next()
376                .unwrap_or("root")
377                .to_owned();
378
379            let entry = modules.entry(module.clone()).or_insert(ModuleNode {
380                name: module.clone(),
381                files: 0,
382                tokens: 0,
383            });
384
385            entry.files += 1;
386            entry.tokens += file.token_count.get(self.model);
387        }
388
389        // Build a map of symbol names to their modules for cross-reference
390        let mut symbol_to_module: HashMap<String, String> = HashMap::new();
391        for file in &repo.files {
392            let module = file
393                .relative_path
394                .split('/')
395                .next()
396                .unwrap_or("root")
397                .to_owned();
398
399            for symbol in &file.symbols {
400                symbol_to_module.insert(symbol.name.clone(), module.clone());
401            }
402        }
403
404        // Compute edges based on imports and calls between modules
405        for file in &repo.files {
406            let from_module = file
407                .relative_path
408                .split('/')
409                .next()
410                .unwrap_or("root")
411                .to_owned();
412
413            for symbol in &file.symbols {
414                // Count import references to other modules
415                if symbol.kind == SymbolKind::Import {
416                    if let Some(target_module) = symbol_to_module.get(&symbol.name) {
417                        if target_module != &from_module {
418                            *edge_counts
419                                .entry((from_module.clone(), target_module.clone()))
420                                .or_insert(0) += 1;
421                        }
422                    }
423                }
424
425                // Count function calls to other modules
426                for called_name in &symbol.calls {
427                    if let Some(target_module) = symbol_to_module.get(called_name) {
428                        if target_module != &from_module {
429                            *edge_counts
430                                .entry((from_module.clone(), target_module.clone()))
431                                .or_insert(0) += 1;
432                        }
433                    }
434                }
435            }
436        }
437
438        // Convert edge counts to edges
439        let edges: Vec<ModuleEdge> = edge_counts
440            .into_iter()
441            .map(|((from, to), weight)| ModuleEdge { from, to, weight })
442            .collect();
443
444        ModuleGraph { nodes: modules.into_values().collect(), edges }
445    }
446
447    fn build_file_index(&self, repo: &Repository, symbol_count: usize) -> Vec<FileIndexEntry> {
448        let mut files: Vec<_> = repo
449            .files
450            .iter()
451            .map(|f| {
452                let importance = if f.importance > 0.8 {
453                    "critical"
454                } else if f.importance > 0.6 {
455                    "high"
456                } else if f.importance > 0.3 {
457                    "normal"
458                } else {
459                    "low"
460                };
461
462                FileIndexEntry {
463                    path: f.relative_path.clone(),
464                    tokens: f.token_count.get(self.model),
465                    importance: importance.to_owned(),
466                    summary: None,
467                }
468            })
469            .collect();
470
471        // Sort by importance
472        files.sort_by(|a, b| {
473            let a_imp = match a.importance.as_str() {
474                "critical" => 4,
475                "high" => 3,
476                "normal" => 2,
477                _ => 1,
478            };
479            let b_imp = match b.importance.as_str() {
480                "critical" => 4,
481                "high" => 3,
482                "normal" => 2,
483                _ => 1,
484            };
485            b_imp.cmp(&a_imp)
486        });
487
488        // Enforce token budget: limit file entries based on remaining budget
489        // Uses centralized constants for token estimation
490        let symbol_tokens = symbol_count as u32 * repomap_consts::TOKENS_PER_SYMBOL;
491        let remaining_budget = self
492            .token_budget
493            .saturating_sub(symbol_tokens)
494            .saturating_sub(repomap_consts::TOKEN_OVERHEAD);
495        let max_files = (remaining_budget / repomap_consts::TOKENS_PER_FILE) as usize;
496
497        if files.len() > max_files && max_files > 0 {
498            files.truncate(max_files);
499        }
500
501        files
502    }
503
504    fn generate_summary(&self, repo: &Repository, symbols: &[RankedSymbol]) -> String {
505        // Deduplicate modules - take top symbols until we have 3 unique modules
506        let mut seen = std::collections::HashSet::new();
507        let top_modules: Vec<_> = symbols
508            .iter()
509            .filter_map(|s| s.file.split('/').next())
510            .filter(|m| seen.insert(*m))
511            .take(3)
512            .collect();
513
514        let primary_lang = repo
515            .metadata
516            .languages
517            .first()
518            .map_or("unknown", |l| l.language.as_str());
519
520        format!(
521            "Repository: {} ({} files, {} lines)\n\
522             Primary language: {}\n\
523             Key modules: {}",
524            repo.name,
525            repo.metadata.total_files,
526            repo.metadata.total_lines,
527            primary_lang,
528            top_modules.join(", ")
529        )
530    }
531
532    fn estimate_tokens(&self, symbols: &[RankedSymbol], files: &[FileIndexEntry]) -> u32 {
533        // Uses centralized constants for token estimation
534        let symbol_tokens = symbols.len() as u32 * repomap_consts::TOKENS_PER_SYMBOL;
535        let file_tokens = files.len() as u32 * repomap_consts::TOKENS_PER_FILE;
536
537        symbol_tokens + file_tokens + repomap_consts::TOKEN_OVERHEAD
538    }
539
540    /// Strip language extension from a file path for indexing
541    /// Supports all 22 languages from the parser module
542    fn strip_language_extension(path: &str) -> &str {
543        // Extensions ordered by specificity (longer first to avoid partial matches)
544        const EXTENSIONS: &[&str] = &[
545            // Multi-char extensions first
546            ".gemspec",
547            ".fsscript",
548            // TypeScript/JavaScript
549            ".tsx",
550            ".jsx",
551            ".mjs",
552            ".cjs",
553            ".ts",
554            ".js",
555            // C/C++
556            ".cpp",
557            ".cxx",
558            ".hpp",
559            ".hxx",
560            ".cc",
561            ".hh",
562            ".c",
563            ".h",
564            // C#
565            ".cs",
566            // Ruby
567            ".rb",
568            ".rake",
569            // Shell
570            ".bash",
571            ".zsh",
572            ".fish",
573            ".sh",
574            // PHP
575            ".phtml",
576            ".php3",
577            ".php4",
578            ".php5",
579            ".phps",
580            ".php",
581            // Kotlin
582            ".kts",
583            ".kt",
584            // Swift
585            ".swift",
586            // Scala
587            ".scala",
588            ".sc",
589            // Haskell
590            ".lhs",
591            ".hs",
592            // Elixir
593            ".heex",
594            ".leex",
595            ".exs",
596            ".eex",
597            ".ex",
598            // Clojure
599            ".cljs",
600            ".cljc",
601            ".edn",
602            ".clj",
603            // OCaml
604            ".mli",
605            ".ml",
606            // F#
607            ".fsx",
608            ".fsi",
609            ".fs",
610            // Lua
611            ".lua",
612            // R
613            ".rmd",
614            ".r",
615            // Python
616            ".pyw",
617            ".py",
618            // Rust
619            ".rs",
620            // Go
621            ".go",
622            // Java
623            ".java",
624        ];
625
626        for ext in EXTENSIONS {
627            if let Some(stripped) = path.strip_suffix(ext) {
628                return stripped;
629            }
630        }
631        path
632    }
633}
634
635#[cfg(test)]
636#[allow(clippy::str_to_string)]
637mod tests {
638    use super::*;
639    use crate::types::{RepoMetadata, TokenCounts};
640
641    fn create_test_repo() -> Repository {
642        Repository {
643            name: "test-repo".to_owned(),
644            path: "/tmp/test".into(),
645            files: vec![RepoFile {
646                path: "/tmp/test/src/main.py".into(),
647                relative_path: "src/main.py".to_string(),
648                language: Some("python".to_string()),
649                size_bytes: 1000,
650                token_count: TokenCounts {
651                    o200k: 240,
652                    cl100k: 245,
653                    claude: 250,
654                    gemini: 230,
655                    llama: 235,
656                    mistral: 235,
657                    deepseek: 235,
658                    qwen: 235,
659                    cohere: 238,
660                    grok: 235,
661                },
662                symbols: vec![Symbol {
663                    name: "main".to_string(),
664                    kind: SymbolKind::Function,
665                    signature: Some("def main() -> None".to_string()),
666                    docstring: Some("Entry point".to_string()),
667                    start_line: 10,
668                    end_line: 25,
669                    references: 5,
670                    importance: 0.9,
671                    parent: None,
672                    visibility: crate::types::Visibility::Public,
673                    calls: vec!["helper".to_string(), "process".to_string()],
674                    extends: None,
675                    implements: vec![],
676                }],
677                importance: 0.9,
678                content: None,
679            }],
680            metadata: RepoMetadata {
681                total_files: 1,
682                total_lines: 100,
683                total_tokens: TokenCounts {
684                    o200k: 240,
685                    cl100k: 245,
686                    claude: 250,
687                    gemini: 230,
688                    llama: 235,
689                    mistral: 235,
690                    deepseek: 235,
691                    qwen: 235,
692                    cohere: 238,
693                    grok: 235,
694                },
695                languages: vec![crate::types::LanguageStats {
696                    language: "Python".to_string(),
697                    files: 1,
698                    lines: 100,
699                    percentage: 100.0,
700                }],
701                framework: None,
702                description: None,
703                branch: None,
704                commit: None,
705                directory_structure: None,
706                external_dependencies: vec![],
707                git_history: None,
708            },
709        }
710    }
711
712    #[test]
713    fn test_generate_repomap() {
714        let repo = create_test_repo();
715        let generator = RepoMapGenerator::new(2000);
716        let map = generator.generate(&repo);
717
718        assert!(!map.summary.is_empty());
719        assert!(!map.file_index.is_empty());
720    }
721
722    // Bug #7 test - verify mapBudget has significant effect
723    #[test]
724    fn test_map_budget_affects_output() {
725        // Test that different budgets produce different effective_max_symbols
726        let small = RepoMapGenerator::new(500);
727        let medium = RepoMapGenerator::new(2000);
728        let large = RepoMapGenerator::new(10000);
729
730        let small_max = small.effective_max_symbols();
731        let medium_max = medium.effective_max_symbols();
732        let large_max = large.effective_max_symbols();
733
734        // Small budget should have fewer max symbols
735        assert!(
736            small_max < medium_max,
737            "Small budget ({}) should have fewer symbols ({}) than medium ({}) with ({})",
738            500,
739            small_max,
740            2000,
741            medium_max
742        );
743
744        // Large budget should have more max symbols
745        assert!(
746            medium_max < large_max,
747            "Medium budget ({}) should have fewer symbols ({}) than large ({}) with ({})",
748            2000,
749            medium_max,
750            10000,
751            large_max
752        );
753
754        // Verify the actual values are reasonable
755        // With BUDGET_SYMBOL_DIVISOR = 20:
756        // 500 / 20 = 25
757        // 2000 / 20 = 100
758        // 10000 / 20 = 500
759        assert!(
760            (20..=30).contains(&small_max),
761            "Small max_symbols should be ~25, got {}",
762            small_max
763        );
764        assert!(
765            (90..=110).contains(&medium_max),
766            "Medium max_symbols should be ~100, got {}",
767            medium_max
768        );
769        assert!(
770            (400..=500).contains(&large_max),
771            "Large max_symbols should be ~500, got {}",
772            large_max
773        );
774    }
775
776    #[test]
777    fn test_map_budget_min_max_clamped() {
778        use crate::constants::repomap as consts;
779
780        // Very small budget should clamp to minimum
781        let tiny = RepoMapGenerator::new(10);
782        assert_eq!(tiny.effective_max_symbols(), consts::MIN_SYMBOLS);
783
784        // Very large budget should clamp to maximum
785        let huge = RepoMapGenerator::new(100_000);
786        assert_eq!(huge.effective_max_symbols(), consts::MAX_SYMBOLS);
787    }
788}