infiniloom_engine/repomap/
mod.rs

1//! Repository map generation with PageRank-based symbol ranking
2
3mod graph;
4
5// ============================================================================
6// Token estimation constants for budget calculations
7// ============================================================================
8// These values are derived from empirical analysis of typical repository maps:
9// - Symbol entries include name, kind, file, line, signature (~25 tokens avg)
10// - File entries include path, tokens, importance (~10 tokens avg)
11// - Overhead covers headers, summary text, formatting (~100 tokens)
12
13/// Estimated tokens per symbol entry in the repository map
14const TOKENS_PER_SYMBOL: u32 = 25;
15
16/// Estimated tokens per file entry in the file index
17const TOKENS_PER_FILE: u32 = 10;
18
19/// Fixed token overhead for headers, summary, and formatting
20const TOKEN_OVERHEAD: u32 = 100;
21
22/// Divisor for computing max symbols from budget (includes safety margin)
23/// Formula: max_symbols = budget / BUDGET_SYMBOL_DIVISOR
24const BUDGET_SYMBOL_DIVISOR: usize = 30;
25const SUMMARY_MAX_LEN: usize = 120;
26
27fn summarize_symbol(symbol: &Symbol) -> Option<String> {
28    let line = symbol
29        .docstring
30        .as_deref()
31        .and_then(first_nonempty_line)
32        .or_else(|| symbol.signature.as_deref().and_then(first_nonempty_line));
33
34    line.map(|text| truncate_summary(text, SUMMARY_MAX_LEN))
35}
36
37fn first_nonempty_line(text: &str) -> Option<&str> {
38    text.lines()
39        .map(|line| line.trim())
40        .find(|line| !line.is_empty())
41}
42
43fn truncate_summary(text: &str, max_len: usize) -> String {
44    if text.chars().count() <= max_len {
45        return text.to_owned();
46    }
47
48    let mut truncated: String = text.chars().take(max_len).collect();
49    truncated = truncated.trim_end().to_owned();
50    truncated.push_str("...");
51    truncated
52}
53
54#[cfg(test)]
55use crate::types::RepoFile;
56use crate::types::{Repository, Symbol, SymbolKind, TokenizerModel};
57use graph::SymbolGraph;
58use serde::Serialize;
59use std::collections::HashMap;
60use std::fmt;
61use typed_builder::TypedBuilder;
62
63/// A repository map - a concise summary of the codebase
64#[derive(Debug, Clone, Serialize)]
65pub struct RepoMap {
66    /// Text summary of the repository
67    pub summary: String,
68    /// Most important symbols ranked by PageRank
69    pub key_symbols: Vec<RankedSymbol>,
70    /// Module/directory dependency graph
71    pub module_graph: ModuleGraph,
72    /// Index of all files with metadata
73    pub file_index: Vec<FileIndexEntry>,
74    /// Total token count for this map
75    pub token_count: u32,
76}
77
78impl fmt::Display for RepoMap {
79    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
80        write!(
81            f,
82            "RepoMap({} symbols, {} files, {} tokens)",
83            self.key_symbols.len(),
84            self.file_index.len(),
85            self.token_count
86        )
87    }
88}
89
90/// A symbol with its computed rank
91#[derive(Debug, Clone, Serialize)]
92pub struct RankedSymbol {
93    /// Symbol name
94    pub name: String,
95    /// Symbol kind
96    pub kind: String,
97    /// File containing the symbol
98    pub file: String,
99    /// Line number
100    pub line: u32,
101    /// Function/method signature
102    pub signature: Option<String>,
103    /// Short summary (docstring or signature)
104    pub summary: Option<String>,
105    /// Number of references
106    pub references: u32,
107    /// Rank (1 = most important)
108    pub rank: u32,
109    /// Importance score (0.0 - 1.0)
110    pub importance: f32,
111}
112
113/// Graph of module dependencies
114#[derive(Debug, Clone, Serialize)]
115pub struct ModuleGraph {
116    /// Module nodes
117    pub nodes: Vec<ModuleNode>,
118    /// Dependency edges
119    pub edges: Vec<ModuleEdge>,
120}
121
122/// A module/directory node
123#[derive(Debug, Clone, Serialize)]
124pub struct ModuleNode {
125    /// Module name (usually directory name)
126    pub name: String,
127    /// Number of files in module
128    pub files: u32,
129    /// Total tokens in module
130    pub tokens: u32,
131}
132
133/// A dependency edge between modules
134#[derive(Debug, Clone, Serialize)]
135pub struct ModuleEdge {
136    /// Source module
137    pub from: String,
138    /// Target module
139    pub to: String,
140    /// Number of imports/references
141    pub weight: u32,
142}
143
144/// File index entry
145#[derive(Debug, Clone, Serialize)]
146pub struct FileIndexEntry {
147    /// Relative file path
148    pub path: String,
149    /// Token count
150    pub tokens: u32,
151    /// Importance level (critical/high/normal/low)
152    pub importance: String,
153    /// Brief summary (optional)
154    pub summary: Option<String>,
155}
156
157/// Generator for repository maps
158///
159/// # Examples
160///
161/// ```rust,ignore
162/// use infiniloom_engine::{RepoMapGenerator, TokenizerModel};
163///
164/// // Use builder pattern
165/// let generator = RepoMapGenerator::builder()
166///     .token_budget(50_000)
167///     .model(TokenizerModel::Gpt4o)
168///     .build();
169///
170/// // Or use the convenience constructor
171/// let generator = RepoMapGenerator::new(2000);
172/// ```
173#[derive(Debug, Clone, TypedBuilder)]
174pub struct RepoMapGenerator {
175    /// Maximum number of symbols to include (default: computed from budget)
176    #[builder(default, setter(strip_option))]
177    max_symbols: Option<usize>,
178
179    /// Target model for token counting (default: Claude)
180    #[builder(default = TokenizerModel::Claude)]
181    model: TokenizerModel,
182
183    /// Token budget for the map (default: 2000)
184    #[builder(default = 2000)]
185    token_budget: u32,
186}
187
188impl RepoMapGenerator {
189    /// Create a new generator with token budget
190    /// The token budget influences how many symbols are included in the map.
191    /// Approximately 25 tokens per symbol entry, so max_symbols = budget / 30 (with overhead)
192    pub fn new(token_budget: u32) -> Self {
193        Self { max_symbols: None, model: TokenizerModel::Claude, token_budget }
194    }
195
196    /// Get effective max symbols (computed from budget if not set)
197    fn effective_max_symbols(&self) -> usize {
198        self.max_symbols.unwrap_or_else(|| {
199            // Use constants for token estimation (see module-level documentation)
200            ((self.token_budget as usize) / BUDGET_SYMBOL_DIVISOR).clamp(10, 200)
201        })
202    }
203
204    /// Generate a repository map with PageRank-based symbol ranking.
205    ///
206    /// The generated map includes:
207    /// - Key symbols ranked by importance (PageRank algorithm)
208    /// - Module dependency graph
209    /// - File index with importance levels
210    /// - Summary statistics
211    ///
212    /// The map is optimized for the configured token budget.
213    #[must_use = "repository map should be used or formatted"]
214    pub fn generate(&self, repo: &Repository) -> RepoMap {
215        // Build symbol graph
216        let mut graph = SymbolGraph::new();
217        for file in &repo.files {
218            graph.add_file(file);
219        }
220
221        // Build lookup index for fast import resolution
222        let symbol_index = self.build_symbol_index(repo);
223
224        // Extract references from symbols using index
225        self.extract_references_fast(&mut graph, repo, &symbol_index);
226
227        // Compute PageRank once
228        let ranks = graph.compute_pagerank(0.85, 20); // Reduced iterations for speed
229
230        // Get top symbols using pre-computed ranks
231        let key_symbols = self.build_ranked_symbols_fast(&graph, &ranks);
232
233        // Build module graph
234        let module_graph = self.build_module_graph(repo);
235
236        // Build file index (with token budget enforcement based on symbol count)
237        let file_index = self.build_file_index(repo, key_symbols.len());
238
239        // Generate summary
240        let summary = self.generate_summary(repo, &key_symbols);
241
242        // Estimate token count
243        let token_count = self.estimate_tokens(&key_symbols, &file_index);
244
245        RepoMap { summary, key_symbols, module_graph, file_index, token_count }
246    }
247
248    /// Build an index of symbols for fast lookup
249    ///
250    /// Uses multi-value index to handle symbol name collisions across files.
251    /// For example, multiple files may have a `main` function - all are indexed.
252    fn build_symbol_index(&self, repo: &Repository) -> HashMap<String, Vec<String>> {
253        let mut index: HashMap<String, Vec<String>> = HashMap::new();
254        for file in &repo.files {
255            // Index by file path (without extension) - supports all 22 languages
256            let path_key = Self::strip_language_extension(&file.relative_path);
257
258            for symbol in &file.symbols {
259                let symbol_key = format!("{}:{}", file.relative_path, symbol.name);
260
261                // Index by symbol name (multi-value to handle collisions)
262                index
263                    .entry(symbol.name.clone())
264                    .or_default()
265                    .push(symbol_key.clone());
266
267                // Index by path component (only for the first symbol to maintain backwards compat)
268                // This allows import resolution like "from utils import foo" -> "utils.py:foo"
269                index
270                    .entry(path_key.to_owned())
271                    .or_default()
272                    .push(symbol_key);
273            }
274        }
275        index
276    }
277
278    /// Fast reference extraction using pre-built index
279    ///
280    /// Uses multi-value index to properly resolve all matching symbols when
281    /// there are name collisions across files.
282    fn extract_references_fast(
283        &self,
284        graph: &mut SymbolGraph,
285        repo: &Repository,
286        index: &HashMap<String, Vec<String>>,
287    ) {
288        for file in &repo.files {
289            for symbol in &file.symbols {
290                let from_key = format!("{}:{}", file.relative_path, symbol.name);
291
292                // Extract import references
293                if symbol.kind == SymbolKind::Import {
294                    // Fast lookup using index - connect to all matching symbols
295                    if let Some(targets) = index.get(&symbol.name) {
296                        for target in targets {
297                            // Don't create self-references
298                            if target != &from_key {
299                                graph.add_reference(&from_key, target, graph::EdgeType::Imports);
300                            }
301                        }
302                    }
303                }
304
305                // Extract inheritance relationships (extends)
306                if let Some(ref base_class) = symbol.extends {
307                    // Try fast lookup first, then fallback to name-based search
308                    if let Some(targets) = index.get(base_class) {
309                        for target in targets {
310                            if target != &from_key {
311                                graph.add_reference(&from_key, target, graph::EdgeType::Inherits);
312                            }
313                        }
314                    } else {
315                        graph.add_reference_by_name(
316                            &from_key,
317                            base_class,
318                            graph::EdgeType::Inherits,
319                        );
320                    }
321                }
322
323                // Extract interface/trait implementations
324                for iface in &symbol.implements {
325                    if let Some(targets) = index.get(iface) {
326                        for target in targets {
327                            if target != &from_key {
328                                graph.add_reference(&from_key, target, graph::EdgeType::Implements);
329                            }
330                        }
331                    } else {
332                        graph.add_reference_by_name(&from_key, iface, graph::EdgeType::Implements);
333                    }
334                }
335
336                // Extract function call relationships
337                for called_name in &symbol.calls {
338                    if let Some(targets) = index.get(called_name) {
339                        for target in targets {
340                            if target != &from_key {
341                                graph.add_reference(&from_key, target, graph::EdgeType::Calls);
342                            }
343                        }
344                    } else {
345                        graph.add_reference_by_name(&from_key, called_name, graph::EdgeType::Calls);
346                    }
347                }
348            }
349        }
350    }
351
352    /// Build ranked symbols using pre-computed ranks
353    fn build_ranked_symbols_fast(
354        &self,
355        graph: &SymbolGraph,
356        ranks: &HashMap<String, f64>,
357    ) -> Vec<RankedSymbol> {
358        let top_nodes = graph.get_top_symbols_with_ranks(ranks, self.effective_max_symbols());
359
360        top_nodes
361            .iter()
362            .enumerate()
363            .map(|(i, node)| {
364                let key = format!("{}:{}", node.file_path, node.symbol.name);
365                let rank_score = ranks.get(&key).copied().unwrap_or(0.0);
366
367                RankedSymbol {
368                    name: node.symbol.name.clone(),
369                    kind: node.symbol.kind.name().to_owned(),
370                    file: node.file_path.clone(),
371                    line: node.symbol.start_line,
372                    signature: node.symbol.signature.clone(),
373                    summary: summarize_symbol(&node.symbol),
374                    references: node.symbol.references,
375                    rank: (i + 1) as u32,
376                    importance: rank_score as f32,
377                }
378            })
379            .collect()
380    }
381
382    fn build_module_graph(&self, repo: &Repository) -> ModuleGraph {
383        let mut modules: HashMap<String, ModuleNode> = HashMap::new();
384        let mut edge_counts: HashMap<(String, String), u32> = HashMap::new();
385
386        // Build file index by module (first pass)
387        for file in &repo.files {
388            let module = file
389                .relative_path
390                .split('/')
391                .next()
392                .unwrap_or("root")
393                .to_owned();
394
395            let entry = modules.entry(module.clone()).or_insert(ModuleNode {
396                name: module.clone(),
397                files: 0,
398                tokens: 0,
399            });
400
401            entry.files += 1;
402            entry.tokens += file.token_count.get(self.model);
403        }
404
405        // Build a map of symbol names to their modules for cross-reference
406        let mut symbol_to_module: HashMap<String, String> = HashMap::new();
407        for file in &repo.files {
408            let module = file
409                .relative_path
410                .split('/')
411                .next()
412                .unwrap_or("root")
413                .to_owned();
414
415            for symbol in &file.symbols {
416                symbol_to_module.insert(symbol.name.clone(), module.clone());
417            }
418        }
419
420        // Compute edges based on imports and calls between modules
421        for file in &repo.files {
422            let from_module = file
423                .relative_path
424                .split('/')
425                .next()
426                .unwrap_or("root")
427                .to_owned();
428
429            for symbol in &file.symbols {
430                // Count import references to other modules
431                if symbol.kind == SymbolKind::Import {
432                    if let Some(target_module) = symbol_to_module.get(&symbol.name) {
433                        if target_module != &from_module {
434                            *edge_counts
435                                .entry((from_module.clone(), target_module.clone()))
436                                .or_insert(0) += 1;
437                        }
438                    }
439                }
440
441                // Count function calls to other modules
442                for called_name in &symbol.calls {
443                    if let Some(target_module) = symbol_to_module.get(called_name) {
444                        if target_module != &from_module {
445                            *edge_counts
446                                .entry((from_module.clone(), target_module.clone()))
447                                .or_insert(0) += 1;
448                        }
449                    }
450                }
451            }
452        }
453
454        // Convert edge counts to edges
455        let edges: Vec<ModuleEdge> = edge_counts
456            .into_iter()
457            .map(|((from, to), weight)| ModuleEdge { from, to, weight })
458            .collect();
459
460        ModuleGraph { nodes: modules.into_values().collect(), edges }
461    }
462
463    fn build_file_index(&self, repo: &Repository, symbol_count: usize) -> Vec<FileIndexEntry> {
464        let mut files: Vec<_> = repo
465            .files
466            .iter()
467            .map(|f| {
468                let importance = if f.importance > 0.8 {
469                    "critical"
470                } else if f.importance > 0.6 {
471                    "high"
472                } else if f.importance > 0.3 {
473                    "normal"
474                } else {
475                    "low"
476                };
477
478                FileIndexEntry {
479                    path: f.relative_path.clone(),
480                    tokens: f.token_count.get(self.model),
481                    importance: importance.to_owned(),
482                    summary: None,
483                }
484            })
485            .collect();
486
487        // Sort by importance
488        files.sort_by(|a, b| {
489            let a_imp = match a.importance.as_str() {
490                "critical" => 4,
491                "high" => 3,
492                "normal" => 2,
493                _ => 1,
494            };
495            let b_imp = match b.importance.as_str() {
496                "critical" => 4,
497                "high" => 3,
498                "normal" => 2,
499                _ => 1,
500            };
501            b_imp.cmp(&a_imp)
502        });
503
504        // Enforce token budget: limit file entries based on remaining budget
505        // Uses constants defined at module level for token estimation
506        let symbol_tokens = symbol_count as u32 * TOKENS_PER_SYMBOL;
507        let remaining_budget = self
508            .token_budget
509            .saturating_sub(symbol_tokens)
510            .saturating_sub(TOKEN_OVERHEAD);
511        let max_files = (remaining_budget / TOKENS_PER_FILE) as usize;
512
513        if files.len() > max_files && max_files > 0 {
514            files.truncate(max_files);
515        }
516
517        files
518    }
519
520    fn generate_summary(&self, repo: &Repository, symbols: &[RankedSymbol]) -> String {
521        // Deduplicate modules - take top symbols until we have 3 unique modules
522        let mut seen = std::collections::HashSet::new();
523        let top_modules: Vec<_> = symbols
524            .iter()
525            .filter_map(|s| s.file.split('/').next())
526            .filter(|m| seen.insert(*m))
527            .take(3)
528            .collect();
529
530        let primary_lang = repo
531            .metadata
532            .languages
533            .first()
534            .map(|l| l.language.as_str())
535            .unwrap_or("unknown");
536
537        format!(
538            "Repository: {} ({} files, {} lines)\n\
539             Primary language: {}\n\
540             Key modules: {}",
541            repo.name,
542            repo.metadata.total_files,
543            repo.metadata.total_lines,
544            primary_lang,
545            top_modules.join(", ")
546        )
547    }
548
549    fn estimate_tokens(&self, symbols: &[RankedSymbol], files: &[FileIndexEntry]) -> u32 {
550        // Uses constants defined at module level for token estimation
551        let symbol_tokens = symbols.len() as u32 * TOKENS_PER_SYMBOL;
552        let file_tokens = files.len() as u32 * TOKENS_PER_FILE;
553
554        symbol_tokens + file_tokens + TOKEN_OVERHEAD
555    }
556
557    /// Strip language extension from a file path for indexing
558    /// Supports all 22 languages from the parser module
559    fn strip_language_extension(path: &str) -> &str {
560        // Extensions ordered by specificity (longer first to avoid partial matches)
561        const EXTENSIONS: &[&str] = &[
562            // Multi-char extensions first
563            ".gemspec",
564            ".fsscript",
565            // TypeScript/JavaScript
566            ".tsx",
567            ".jsx",
568            ".mjs",
569            ".cjs",
570            ".ts",
571            ".js",
572            // C/C++
573            ".cpp",
574            ".cxx",
575            ".hpp",
576            ".hxx",
577            ".cc",
578            ".hh",
579            ".c",
580            ".h",
581            // C#
582            ".cs",
583            // Ruby
584            ".rb",
585            ".rake",
586            // Shell
587            ".bash",
588            ".zsh",
589            ".fish",
590            ".sh",
591            // PHP
592            ".phtml",
593            ".php3",
594            ".php4",
595            ".php5",
596            ".phps",
597            ".php",
598            // Kotlin
599            ".kts",
600            ".kt",
601            // Swift
602            ".swift",
603            // Scala
604            ".scala",
605            ".sc",
606            // Haskell
607            ".lhs",
608            ".hs",
609            // Elixir
610            ".heex",
611            ".leex",
612            ".exs",
613            ".eex",
614            ".ex",
615            // Clojure
616            ".cljs",
617            ".cljc",
618            ".edn",
619            ".clj",
620            // OCaml
621            ".mli",
622            ".ml",
623            // F#
624            ".fsx",
625            ".fsi",
626            ".fs",
627            // Lua
628            ".lua",
629            // R
630            ".rmd",
631            ".r",
632            // Python
633            ".pyw",
634            ".py",
635            // Rust
636            ".rs",
637            // Go
638            ".go",
639            // Java
640            ".java",
641        ];
642
643        for ext in EXTENSIONS {
644            if let Some(stripped) = path.strip_suffix(ext) {
645                return stripped;
646            }
647        }
648        path
649    }
650}
651
652#[cfg(test)]
653#[allow(clippy::str_to_string)]
654mod tests {
655    use super::*;
656    use crate::types::{RepoMetadata, TokenCounts};
657
658    fn create_test_repo() -> Repository {
659        Repository {
660            name: "test-repo".to_owned(),
661            path: "/tmp/test".into(),
662            files: vec![RepoFile {
663                path: "/tmp/test/src/main.py".into(),
664                relative_path: "src/main.py".to_string(),
665                language: Some("python".to_string()),
666                size_bytes: 1000,
667                token_count: TokenCounts {
668                    o200k: 240,
669                    cl100k: 245,
670                    claude: 250,
671                    gemini: 230,
672                    llama: 235,
673                    mistral: 235,
674                    deepseek: 235,
675                    qwen: 235,
676                    cohere: 238,
677                    grok: 235,
678                },
679                symbols: vec![Symbol {
680                    name: "main".to_string(),
681                    kind: SymbolKind::Function,
682                    signature: Some("def main() -> None".to_string()),
683                    docstring: Some("Entry point".to_string()),
684                    start_line: 10,
685                    end_line: 25,
686                    references: 5,
687                    importance: 0.9,
688                    parent: None,
689                    visibility: crate::types::Visibility::Public,
690                    calls: vec!["helper".to_string(), "process".to_string()],
691                    extends: None,
692                    implements: vec![],
693                }],
694                importance: 0.9,
695                content: None,
696            }],
697            metadata: RepoMetadata {
698                total_files: 1,
699                total_lines: 100,
700                total_tokens: TokenCounts {
701                    o200k: 240,
702                    cl100k: 245,
703                    claude: 250,
704                    gemini: 230,
705                    llama: 235,
706                    mistral: 235,
707                    deepseek: 235,
708                    qwen: 235,
709                    cohere: 238,
710                    grok: 235,
711                },
712                languages: vec![crate::types::LanguageStats {
713                    language: "Python".to_string(),
714                    files: 1,
715                    lines: 100,
716                    percentage: 100.0,
717                }],
718                framework: None,
719                description: None,
720                branch: None,
721                commit: None,
722                directory_structure: None,
723                external_dependencies: vec![],
724                git_history: None,
725            },
726        }
727    }
728
729    #[test]
730    fn test_generate_repomap() {
731        let repo = create_test_repo();
732        let generator = RepoMapGenerator::new(2000);
733        let map = generator.generate(&repo);
734
735        assert!(!map.summary.is_empty());
736        assert!(!map.file_index.is_empty());
737    }
738}