infiniloom_engine/repomap/
mod.rs

1//! Repository map generation with PageRank-based symbol ranking
2
3mod graph;
4
5use crate::constants::repomap as repomap_consts;
6
7fn summarize_symbol(symbol: &Symbol) -> Option<String> {
8    let line = symbol
9        .docstring
10        .as_deref()
11        .and_then(first_nonempty_line)
12        .or_else(|| symbol.signature.as_deref().and_then(first_nonempty_line));
13
14    line.map(|text| truncate_summary(text, repomap_consts::SUMMARY_MAX_LEN))
15}
16
17fn first_nonempty_line(text: &str) -> Option<&str> {
18    text.lines()
19        .map(|line| line.trim())
20        .find(|line| !line.is_empty())
21}
22
23fn truncate_summary(text: &str, max_len: usize) -> String {
24    if text.chars().count() <= max_len {
25        return text.to_owned();
26    }
27
28    let mut truncated: String = text.chars().take(max_len).collect();
29    truncated = truncated.trim_end().to_owned();
30    truncated.push_str("...");
31    truncated
32}
33
34#[cfg(test)]
35use crate::types::RepoFile;
36use crate::types::{Repository, Symbol, SymbolKind, TokenizerModel};
37use graph::SymbolGraph;
38use serde::Serialize;
39use std::collections::HashMap;
40use std::fmt;
41use typed_builder::TypedBuilder;
42
43/// A repository map - a concise summary of the codebase
44#[derive(Debug, Clone, Serialize)]
45pub struct RepoMap {
46    /// Text summary of the repository
47    pub summary: String,
48    /// Most important symbols ranked by PageRank
49    pub key_symbols: Vec<RankedSymbol>,
50    /// Module/directory dependency graph
51    pub module_graph: ModuleGraph,
52    /// Index of all files with metadata
53    pub file_index: Vec<FileIndexEntry>,
54    /// Total token count for this map
55    pub token_count: u32,
56}
57
58impl fmt::Display for RepoMap {
59    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
60        write!(
61            f,
62            "RepoMap({} symbols, {} files, {} tokens)",
63            self.key_symbols.len(),
64            self.file_index.len(),
65            self.token_count
66        )
67    }
68}
69
70/// A symbol with its computed rank
71#[derive(Debug, Clone, Serialize)]
72pub struct RankedSymbol {
73    /// Symbol name
74    pub name: String,
75    /// Symbol kind
76    pub kind: String,
77    /// File containing the symbol
78    pub file: String,
79    /// Line number
80    pub line: u32,
81    /// Function/method signature
82    pub signature: Option<String>,
83    /// Short summary (docstring or signature)
84    pub summary: Option<String>,
85    /// Number of references
86    pub references: u32,
87    /// Rank (1 = most important)
88    pub rank: u32,
89    /// Importance score (0.0 - 1.0)
90    pub importance: f32,
91}
92
93/// Graph of module dependencies
94#[derive(Debug, Clone, Serialize)]
95pub struct ModuleGraph {
96    /// Module nodes
97    pub nodes: Vec<ModuleNode>,
98    /// Dependency edges
99    pub edges: Vec<ModuleEdge>,
100}
101
102/// A module/directory node
103#[derive(Debug, Clone, Serialize)]
104pub struct ModuleNode {
105    /// Module name (usually directory name)
106    pub name: String,
107    /// Number of files in module
108    pub files: u32,
109    /// Total tokens in module
110    pub tokens: u32,
111}
112
113/// A dependency edge between modules
114#[derive(Debug, Clone, Serialize)]
115pub struct ModuleEdge {
116    /// Source module
117    pub from: String,
118    /// Target module
119    pub to: String,
120    /// Number of imports/references
121    pub weight: u32,
122}
123
124/// File index entry
125#[derive(Debug, Clone, Serialize)]
126pub struct FileIndexEntry {
127    /// Relative file path
128    pub path: String,
129    /// Token count
130    pub tokens: u32,
131    /// Importance level (critical/high/normal/low)
132    pub importance: String,
133    /// Brief summary (optional)
134    pub summary: Option<String>,
135}
136
137/// Generator for repository maps
138///
139/// # Examples
140///
141/// ```rust,ignore
142/// use infiniloom_engine::{RepoMapGenerator, TokenizerModel};
143///
144/// // Use builder pattern
145/// let generator = RepoMapGenerator::builder()
146///     .token_budget(50_000)
147///     .model(TokenizerModel::Gpt4o)
148///     .build();
149///
150/// // Or use the convenience constructor
151/// let generator = RepoMapGenerator::new(2000);
152/// ```
153#[derive(Debug, Clone, TypedBuilder)]
154pub struct RepoMapGenerator {
155    /// Maximum number of symbols to include (default: computed from budget)
156    #[builder(default, setter(strip_option))]
157    max_symbols: Option<usize>,
158
159    /// Target model for token counting (default: Claude)
160    #[builder(default = TokenizerModel::Claude)]
161    model: TokenizerModel,
162
163    /// Token budget for the map (default: 2000)
164    #[builder(default = 2000)]
165    token_budget: u32,
166}
167
168impl RepoMapGenerator {
169    /// Create a new generator with token budget
170    /// The token budget influences how many symbols are included in the map.
171    /// Approximately 25 tokens per symbol entry, so max_symbols = budget / 30 (with overhead)
172    pub fn new(token_budget: u32) -> Self {
173        Self { max_symbols: None, model: TokenizerModel::Claude, token_budget }
174    }
175
176    /// Get effective max symbols (computed from budget if not set)
177    ///
178    /// Bug #7 fix: Increased max cap from 200 to 500 and made formula more responsive
179    fn effective_max_symbols(&self) -> usize {
180        self.max_symbols.unwrap_or_else(|| {
181            // Use constants for token estimation (see module-level documentation)
182            // Bug #7 fix: Increased max from 200 to 500 for larger budgets
183            ((self.token_budget as usize) / repomap_consts::BUDGET_SYMBOL_DIVISOR)
184                .clamp(repomap_consts::MIN_SYMBOLS, repomap_consts::MAX_SYMBOLS)
185        })
186    }
187
188    /// Generate a repository map with PageRank-based symbol ranking.
189    ///
190    /// The generated map includes:
191    /// - Key symbols ranked by importance (PageRank algorithm)
192    /// - Module dependency graph
193    /// - File index with importance levels
194    /// - Summary statistics
195    ///
196    /// The map is optimized for the configured token budget.
197    #[must_use = "repository map should be used or formatted"]
198    pub fn generate(&self, repo: &Repository) -> RepoMap {
199        // Build symbol graph
200        let mut graph = SymbolGraph::new();
201        for file in &repo.files {
202            graph.add_file(file);
203        }
204
205        // Build lookup index for fast import resolution
206        let symbol_index = self.build_symbol_index(repo);
207
208        // Extract references from symbols using index
209        self.extract_references_fast(&mut graph, repo, &symbol_index);
210
211        // Compute PageRank once
212        let ranks = graph.compute_pagerank(0.85, 20); // Reduced iterations for speed
213
214        // Get top symbols using pre-computed ranks
215        let key_symbols = self.build_ranked_symbols_fast(&graph, &ranks);
216
217        // Build module graph
218        let module_graph = self.build_module_graph(repo);
219
220        // Build file index (with token budget enforcement based on symbol count)
221        let file_index = self.build_file_index(repo, key_symbols.len());
222
223        // Generate summary
224        let summary = self.generate_summary(repo, &key_symbols);
225
226        // Estimate token count
227        let token_count = self.estimate_tokens(&key_symbols, &file_index);
228
229        RepoMap { summary, key_symbols, module_graph, file_index, token_count }
230    }
231
232    /// Build an index of symbols for fast lookup
233    ///
234    /// Uses multi-value index to handle symbol name collisions across files.
235    /// For example, multiple files may have a `main` function - all are indexed.
236    fn build_symbol_index(&self, repo: &Repository) -> HashMap<String, Vec<String>> {
237        let mut index: HashMap<String, Vec<String>> = HashMap::new();
238        for file in &repo.files {
239            // Index by file path (without extension) - supports all 22 languages
240            let path_key = Self::strip_language_extension(&file.relative_path);
241
242            for symbol in &file.symbols {
243                let symbol_key = format!("{}:{}", file.relative_path, symbol.name);
244
245                // Index by symbol name (multi-value to handle collisions)
246                index
247                    .entry(symbol.name.clone())
248                    .or_default()
249                    .push(symbol_key.clone());
250
251                // Index by path component (only for the first symbol to maintain backwards compat)
252                // This allows import resolution like "from utils import foo" -> "utils.py:foo"
253                index
254                    .entry(path_key.to_owned())
255                    .or_default()
256                    .push(symbol_key);
257            }
258        }
259        index
260    }
261
262    /// Fast reference extraction using pre-built index
263    ///
264    /// Uses multi-value index to properly resolve all matching symbols when
265    /// there are name collisions across files.
266    fn extract_references_fast(
267        &self,
268        graph: &mut SymbolGraph,
269        repo: &Repository,
270        index: &HashMap<String, Vec<String>>,
271    ) {
272        for file in &repo.files {
273            for symbol in &file.symbols {
274                let from_key = format!("{}:{}", file.relative_path, symbol.name);
275
276                // Extract import references
277                if symbol.kind == SymbolKind::Import {
278                    // Fast lookup using index - connect to all matching symbols
279                    if let Some(targets) = index.get(&symbol.name) {
280                        for target in targets {
281                            // Don't create self-references
282                            if target != &from_key {
283                                graph.add_reference(&from_key, target, graph::EdgeType::Imports);
284                            }
285                        }
286                    }
287                }
288
289                // Extract inheritance relationships (extends)
290                if let Some(ref base_class) = symbol.extends {
291                    // Try fast lookup first, then fallback to name-based search
292                    if let Some(targets) = index.get(base_class) {
293                        for target in targets {
294                            if target != &from_key {
295                                graph.add_reference(&from_key, target, graph::EdgeType::Inherits);
296                            }
297                        }
298                    } else {
299                        graph.add_reference_by_name(
300                            &from_key,
301                            base_class,
302                            graph::EdgeType::Inherits,
303                        );
304                    }
305                }
306
307                // Extract interface/trait implementations
308                for iface in &symbol.implements {
309                    if let Some(targets) = index.get(iface) {
310                        for target in targets {
311                            if target != &from_key {
312                                graph.add_reference(&from_key, target, graph::EdgeType::Implements);
313                            }
314                        }
315                    } else {
316                        graph.add_reference_by_name(&from_key, iface, graph::EdgeType::Implements);
317                    }
318                }
319
320                // Extract function call relationships
321                for called_name in &symbol.calls {
322                    if let Some(targets) = index.get(called_name) {
323                        for target in targets {
324                            if target != &from_key {
325                                graph.add_reference(&from_key, target, graph::EdgeType::Calls);
326                            }
327                        }
328                    } else {
329                        graph.add_reference_by_name(&from_key, called_name, graph::EdgeType::Calls);
330                    }
331                }
332            }
333        }
334    }
335
336    /// Build ranked symbols using pre-computed ranks
337    fn build_ranked_symbols_fast(
338        &self,
339        graph: &SymbolGraph,
340        ranks: &HashMap<String, f64>,
341    ) -> Vec<RankedSymbol> {
342        let top_nodes = graph.get_top_symbols_with_ranks(ranks, self.effective_max_symbols());
343
344        top_nodes
345            .iter()
346            .enumerate()
347            .map(|(i, node)| {
348                let key = format!("{}:{}", node.file_path, node.symbol.name);
349                let rank_score = ranks.get(&key).copied().unwrap_or(0.0);
350
351                RankedSymbol {
352                    name: node.symbol.name.clone(),
353                    kind: node.symbol.kind.name().to_owned(),
354                    file: node.file_path.clone(),
355                    line: node.symbol.start_line,
356                    signature: node.symbol.signature.clone(),
357                    summary: summarize_symbol(&node.symbol),
358                    references: node.symbol.references,
359                    rank: (i + 1) as u32,
360                    importance: rank_score as f32,
361                }
362            })
363            .collect()
364    }
365
366    fn build_module_graph(&self, repo: &Repository) -> ModuleGraph {
367        let mut modules: HashMap<String, ModuleNode> = HashMap::new();
368        let mut edge_counts: HashMap<(String, String), u32> = HashMap::new();
369
370        // Build file index by module (first pass)
371        for file in &repo.files {
372            let module = file
373                .relative_path
374                .split('/')
375                .next()
376                .unwrap_or("root")
377                .to_owned();
378
379            let entry = modules.entry(module.clone()).or_insert(ModuleNode {
380                name: module.clone(),
381                files: 0,
382                tokens: 0,
383            });
384
385            entry.files += 1;
386            entry.tokens += file.token_count.get(self.model);
387        }
388
389        // Build a map of symbol names to their modules for cross-reference
390        let mut symbol_to_module: HashMap<String, String> = HashMap::new();
391        for file in &repo.files {
392            let module = file
393                .relative_path
394                .split('/')
395                .next()
396                .unwrap_or("root")
397                .to_owned();
398
399            for symbol in &file.symbols {
400                symbol_to_module.insert(symbol.name.clone(), module.clone());
401            }
402        }
403
404        // Compute edges based on imports and calls between modules
405        for file in &repo.files {
406            let from_module = file
407                .relative_path
408                .split('/')
409                .next()
410                .unwrap_or("root")
411                .to_owned();
412
413            for symbol in &file.symbols {
414                // Count import references to other modules
415                if symbol.kind == SymbolKind::Import {
416                    if let Some(target_module) = symbol_to_module.get(&symbol.name) {
417                        if target_module != &from_module {
418                            *edge_counts
419                                .entry((from_module.clone(), target_module.clone()))
420                                .or_insert(0) += 1;
421                        }
422                    }
423                }
424
425                // Count function calls to other modules
426                for called_name in &symbol.calls {
427                    if let Some(target_module) = symbol_to_module.get(called_name) {
428                        if target_module != &from_module {
429                            *edge_counts
430                                .entry((from_module.clone(), target_module.clone()))
431                                .or_insert(0) += 1;
432                        }
433                    }
434                }
435            }
436        }
437
438        // Convert edge counts to edges
439        let edges: Vec<ModuleEdge> = edge_counts
440            .into_iter()
441            .map(|((from, to), weight)| ModuleEdge { from, to, weight })
442            .collect();
443
444        ModuleGraph { nodes: modules.into_values().collect(), edges }
445    }
446
447    fn build_file_index(&self, repo: &Repository, symbol_count: usize) -> Vec<FileIndexEntry> {
448        let mut files: Vec<_> = repo
449            .files
450            .iter()
451            .map(|f| {
452                let importance = if f.importance > 0.8 {
453                    "critical"
454                } else if f.importance > 0.6 {
455                    "high"
456                } else if f.importance > 0.3 {
457                    "normal"
458                } else {
459                    "low"
460                };
461
462                FileIndexEntry {
463                    path: f.relative_path.clone(),
464                    tokens: f.token_count.get(self.model),
465                    importance: importance.to_owned(),
466                    summary: None,
467                }
468            })
469            .collect();
470
471        // Sort by importance
472        files.sort_by(|a, b| {
473            let a_imp = match a.importance.as_str() {
474                "critical" => 4,
475                "high" => 3,
476                "normal" => 2,
477                _ => 1,
478            };
479            let b_imp = match b.importance.as_str() {
480                "critical" => 4,
481                "high" => 3,
482                "normal" => 2,
483                _ => 1,
484            };
485            b_imp.cmp(&a_imp)
486        });
487
488        // Enforce token budget: limit file entries based on remaining budget
489        // Uses centralized constants for token estimation
490        let symbol_tokens = symbol_count as u32 * repomap_consts::TOKENS_PER_SYMBOL;
491        let remaining_budget = self
492            .token_budget
493            .saturating_sub(symbol_tokens)
494            .saturating_sub(repomap_consts::TOKEN_OVERHEAD);
495        let max_files = (remaining_budget / repomap_consts::TOKENS_PER_FILE) as usize;
496
497        if files.len() > max_files && max_files > 0 {
498            files.truncate(max_files);
499        }
500
501        files
502    }
503
504    fn generate_summary(&self, repo: &Repository, symbols: &[RankedSymbol]) -> String {
505        // Deduplicate modules - take top symbols until we have 3 unique modules
506        let mut seen = std::collections::HashSet::new();
507        let top_modules: Vec<_> = symbols
508            .iter()
509            .filter_map(|s| s.file.split('/').next())
510            .filter(|m| seen.insert(*m))
511            .take(3)
512            .collect();
513
514        let primary_lang = repo
515            .metadata
516            .languages
517            .first()
518            .map(|l| l.language.as_str())
519            .unwrap_or("unknown");
520
521        format!(
522            "Repository: {} ({} files, {} lines)\n\
523             Primary language: {}\n\
524             Key modules: {}",
525            repo.name,
526            repo.metadata.total_files,
527            repo.metadata.total_lines,
528            primary_lang,
529            top_modules.join(", ")
530        )
531    }
532
533    fn estimate_tokens(&self, symbols: &[RankedSymbol], files: &[FileIndexEntry]) -> u32 {
534        // Uses centralized constants for token estimation
535        let symbol_tokens = symbols.len() as u32 * repomap_consts::TOKENS_PER_SYMBOL;
536        let file_tokens = files.len() as u32 * repomap_consts::TOKENS_PER_FILE;
537
538        symbol_tokens + file_tokens + repomap_consts::TOKEN_OVERHEAD
539    }
540
541    /// Strip language extension from a file path for indexing
542    /// Supports all 22 languages from the parser module
543    fn strip_language_extension(path: &str) -> &str {
544        // Extensions ordered by specificity (longer first to avoid partial matches)
545        const EXTENSIONS: &[&str] = &[
546            // Multi-char extensions first
547            ".gemspec",
548            ".fsscript",
549            // TypeScript/JavaScript
550            ".tsx",
551            ".jsx",
552            ".mjs",
553            ".cjs",
554            ".ts",
555            ".js",
556            // C/C++
557            ".cpp",
558            ".cxx",
559            ".hpp",
560            ".hxx",
561            ".cc",
562            ".hh",
563            ".c",
564            ".h",
565            // C#
566            ".cs",
567            // Ruby
568            ".rb",
569            ".rake",
570            // Shell
571            ".bash",
572            ".zsh",
573            ".fish",
574            ".sh",
575            // PHP
576            ".phtml",
577            ".php3",
578            ".php4",
579            ".php5",
580            ".phps",
581            ".php",
582            // Kotlin
583            ".kts",
584            ".kt",
585            // Swift
586            ".swift",
587            // Scala
588            ".scala",
589            ".sc",
590            // Haskell
591            ".lhs",
592            ".hs",
593            // Elixir
594            ".heex",
595            ".leex",
596            ".exs",
597            ".eex",
598            ".ex",
599            // Clojure
600            ".cljs",
601            ".cljc",
602            ".edn",
603            ".clj",
604            // OCaml
605            ".mli",
606            ".ml",
607            // F#
608            ".fsx",
609            ".fsi",
610            ".fs",
611            // Lua
612            ".lua",
613            // R
614            ".rmd",
615            ".r",
616            // Python
617            ".pyw",
618            ".py",
619            // Rust
620            ".rs",
621            // Go
622            ".go",
623            // Java
624            ".java",
625        ];
626
627        for ext in EXTENSIONS {
628            if let Some(stripped) = path.strip_suffix(ext) {
629                return stripped;
630            }
631        }
632        path
633    }
634}
635
636#[cfg(test)]
637#[allow(clippy::str_to_string)]
638mod tests {
639    use super::*;
640    use crate::types::{RepoMetadata, TokenCounts};
641
642    fn create_test_repo() -> Repository {
643        Repository {
644            name: "test-repo".to_owned(),
645            path: "/tmp/test".into(),
646            files: vec![RepoFile {
647                path: "/tmp/test/src/main.py".into(),
648                relative_path: "src/main.py".to_string(),
649                language: Some("python".to_string()),
650                size_bytes: 1000,
651                token_count: TokenCounts {
652                    o200k: 240,
653                    cl100k: 245,
654                    claude: 250,
655                    gemini: 230,
656                    llama: 235,
657                    mistral: 235,
658                    deepseek: 235,
659                    qwen: 235,
660                    cohere: 238,
661                    grok: 235,
662                },
663                symbols: vec![Symbol {
664                    name: "main".to_string(),
665                    kind: SymbolKind::Function,
666                    signature: Some("def main() -> None".to_string()),
667                    docstring: Some("Entry point".to_string()),
668                    start_line: 10,
669                    end_line: 25,
670                    references: 5,
671                    importance: 0.9,
672                    parent: None,
673                    visibility: crate::types::Visibility::Public,
674                    calls: vec!["helper".to_string(), "process".to_string()],
675                    extends: None,
676                    implements: vec![],
677                }],
678                importance: 0.9,
679                content: None,
680            }],
681            metadata: RepoMetadata {
682                total_files: 1,
683                total_lines: 100,
684                total_tokens: TokenCounts {
685                    o200k: 240,
686                    cl100k: 245,
687                    claude: 250,
688                    gemini: 230,
689                    llama: 235,
690                    mistral: 235,
691                    deepseek: 235,
692                    qwen: 235,
693                    cohere: 238,
694                    grok: 235,
695                },
696                languages: vec![crate::types::LanguageStats {
697                    language: "Python".to_string(),
698                    files: 1,
699                    lines: 100,
700                    percentage: 100.0,
701                }],
702                framework: None,
703                description: None,
704                branch: None,
705                commit: None,
706                directory_structure: None,
707                external_dependencies: vec![],
708                git_history: None,
709            },
710        }
711    }
712
713    #[test]
714    fn test_generate_repomap() {
715        let repo = create_test_repo();
716        let generator = RepoMapGenerator::new(2000);
717        let map = generator.generate(&repo);
718
719        assert!(!map.summary.is_empty());
720        assert!(!map.file_index.is_empty());
721    }
722
723    // Bug #7 test - verify mapBudget has significant effect
724    #[test]
725    fn test_map_budget_affects_output() {
726        // Test that different budgets produce different effective_max_symbols
727        let small = RepoMapGenerator::new(500);
728        let medium = RepoMapGenerator::new(2000);
729        let large = RepoMapGenerator::new(10000);
730
731        let small_max = small.effective_max_symbols();
732        let medium_max = medium.effective_max_symbols();
733        let large_max = large.effective_max_symbols();
734
735        // Small budget should have fewer max symbols
736        assert!(
737            small_max < medium_max,
738            "Small budget ({}) should have fewer symbols ({}) than medium ({}) with ({})",
739            500,
740            small_max,
741            2000,
742            medium_max
743        );
744
745        // Large budget should have more max symbols
746        assert!(
747            medium_max < large_max,
748            "Medium budget ({}) should have fewer symbols ({}) than large ({}) with ({})",
749            2000,
750            medium_max,
751            10000,
752            large_max
753        );
754
755        // Verify the actual values are reasonable
756        // With BUDGET_SYMBOL_DIVISOR = 20:
757        // 500 / 20 = 25
758        // 2000 / 20 = 100
759        // 10000 / 20 = 500
760        assert!(
761            (20..=30).contains(&small_max),
762            "Small max_symbols should be ~25, got {}",
763            small_max
764        );
765        assert!(
766            (90..=110).contains(&medium_max),
767            "Medium max_symbols should be ~100, got {}",
768            medium_max
769        );
770        assert!(
771            (400..=500).contains(&large_max),
772            "Large max_symbols should be ~500, got {}",
773            large_max
774        );
775    }
776
777    #[test]
778    fn test_map_budget_min_max_clamped() {
779        use crate::constants::repomap as consts;
780
781        // Very small budget should clamp to minimum
782        let tiny = RepoMapGenerator::new(10);
783        assert_eq!(tiny.effective_max_symbols(), consts::MIN_SYMBOLS);
784
785        // Very large budget should clamp to maximum
786        let huge = RepoMapGenerator::new(100_000);
787        assert_eq!(huge.effective_max_symbols(), consts::MAX_SYMBOLS);
788    }
789}