scribe_analysis/heuristics/
import_analysis.rs

1//! # Import Graph Analysis and Centrality Calculation
2//!
3//! Implements dependency graph construction and PageRank centrality calculation
4//! for the heuristic scoring system. This module provides:
5//!
6//! - AST-based import statement parsing and normalization
7//! - Dependency graph construction with intelligent matching
8//! - PageRank centrality computation
9//! - Import-to-file path matching heuristics
10//!
11//! The centrality scores help identify important files that are heavily depended upon.
12//! Uses tree-sitter AST parsing instead of regex patterns for accurate import extraction.
13
14use std::collections::HashMap;
15use std::path::Path;
16use scribe_core::Result;
17use super::ScanResult;
18use crate::ast_import_parser::{SimpleAstParser, ImportLanguage, SimpleImport};
19
20/// Dependency graph for centrality calculation
21#[derive(Debug, Clone)]
22pub struct ImportGraph {
23    /// Node index to file path mapping
24    nodes: Vec<String>,
25    /// File path to node index mapping
26    path_to_index: HashMap<String, usize>,
27    /// Adjacency list: node -> list of nodes it depends on
28    dependencies: Vec<Vec<usize>>,
29    /// Reverse adjacency list: node -> list of nodes that depend on it
30    dependents: Vec<Vec<usize>>,
31    /// Cached PageRank scores
32    pagerank_scores: Option<Vec<f64>>,
33}
34
35/// Builder for constructing import graphs from scan results
36#[derive(Debug)]
37pub struct ImportGraphBuilder {
38    /// Import path normalization cache
39    normalization_cache: HashMap<String, String>,
40    /// AST parser for extracting imports
41    ast_parser: SimpleAstParser,
42}
43
44
45/// Centrality calculator implementing PageRank algorithm
46#[derive(Debug)]
47pub struct CentralityCalculator {
48    /// PageRank damping factor (typically 0.85)
49    damping_factor: f64,
50    /// Maximum iterations for convergence
51    max_iterations: usize,
52    /// Convergence tolerance
53    tolerance: f64,
54}
55
56impl Default for ImportGraphBuilder {
57    fn default() -> Self {
58        Self::new().expect("Failed to create ImportGraphBuilder")
59    }
60}
61
62impl ImportGraphBuilder {
63    /// Create a new import graph builder
64    pub fn new() -> Result<Self> {
65        Ok(Self {
66            normalization_cache: HashMap::new(),
67            ast_parser: SimpleAstParser::new()?,
68        })
69    }
70    
71    /// Build import graph from scan results
72    pub fn build_graph<T>(&mut self, files: &[T]) -> Result<ImportGraph> 
73    where 
74        T: ScanResult,
75    {
76        let mut graph = ImportGraph::new();
77        
78        // Add all files as nodes
79        for file in files {
80            graph.add_node(file.path().to_string());
81        }
82        
83        // Build edges based on import relationships
84        for (file_idx, file) in files.iter().enumerate() {
85            if let Some(imports) = file.imports() {
86                for import_path in imports {
87                    if let Some(target_idx) = self.find_matching_file(import_path, files) {
88                        graph.add_edge(file_idx, target_idx);
89                    }
90                }
91            }
92        }
93        
94        Ok(graph)
95    }
96    
97    /// Find which file an import statement refers to
98    fn find_matching_file<T>(&mut self, import_path: &str, files: &[T]) -> Option<usize>
99    where 
100        T: ScanResult,
101    {
102        let normalized_import = self.normalize_import_path(import_path);
103        
104        for (idx, file) in files.iter().enumerate() {
105            if import_matches_file(&normalized_import, file.path()) {
106                return Some(idx);
107            }
108        }
109        
110        None
111    }
112    
113    /// Normalize import path for matching
114    fn normalize_import_path(&mut self, import_path: &str) -> String {
115        // Check cache first
116        if let Some(cached) = self.normalization_cache.get(import_path) {
117            return cached.clone();
118        }
119        
120        let normalized = self.perform_normalization(import_path);
121        self.normalization_cache.insert(import_path.to_string(), normalized.clone());
122        normalized
123    }
124    
125    /// Perform actual import path normalization
126    fn perform_normalization(&self, import_path: &str) -> String {
127        let mut normalized = import_path.to_string();
128        
129        // Remove quotes and whitespace
130        normalized = normalized.trim_matches(|c| c == '"' || c == '\'' || c == ' ').to_string();
131        
132        // Normalize path separators
133        normalized = normalized.replace('\\', "/");
134        
135        // Remove file extensions for language-agnostic matching
136        if let Some(dot_pos) = normalized.rfind('.') {
137            // Keep extension only for specific cases
138            let extension = &normalized[dot_pos..];
139            match extension {
140                ".json" | ".xml" | ".yaml" | ".toml" | ".css" | ".scss" => {
141                    // Keep these extensions as they're often imported directly
142                }
143                _ => {
144                    // Remove programming language extensions
145                    normalized.truncate(dot_pos);
146                }
147            }
148        }
149        
150        // Handle relative imports
151        if normalized.starts_with("./") || normalized.starts_with("../") {
152            // Keep relative structure but normalize
153            normalized = self.normalize_relative_path(&normalized);
154        }
155        
156        // Convert module-style imports to file paths
157        normalized = normalized.replace("::", "/").replace('.', "/");
158        
159        // Handle common import aliases
160        normalized = self.resolve_import_aliases(&normalized);
161        
162        normalized
163    }
164    
165    /// Normalize relative path imports
166    fn normalize_relative_path(&self, path: &str) -> String {
167        // Simple path normalization - in practice, this would be more sophisticated
168        let components: Vec<&str> = path.split('/').collect();
169        let mut normalized = Vec::new();
170        
171        for component in components {
172            match component {
173                "" | "." => continue,
174                ".." => {
175                    normalized.pop();
176                }
177                _ => normalized.push(component),
178            }
179        }
180        
181        normalized.join("/")
182    }
183    
184    /// Resolve common import aliases and patterns
185    fn resolve_import_aliases(&self, import_path: &str) -> String {
186        // Handle common aliases
187        match import_path {
188            // Node.js built-ins
189            "fs" | "path" | "http" | "https" | "url" | "crypto" | "os" => {
190                return import_path.to_string(); // Keep as-is for built-ins
191            }
192            // Package imports (usually not files in the repo)
193            path if !path.contains('/') && !path.starts_with('.') => {
194                return import_path.to_string(); // Likely external package
195            }
196            _ => {}
197        }
198        
199        let mut resolved = import_path.to_string();
200        
201        // Common alias patterns
202        if resolved.starts_with("@/") {
203            resolved = resolved.replacen("@/", "src/", 1);
204        }
205        
206        if resolved.starts_with("~") {
207            resolved = resolved.replacen("~", "src", 1);
208        }
209        
210        resolved
211    }
212    
213    /// Detect language from file extension for AST parsing
214    fn detect_language_from_path(&self, file_path: &str) -> Option<ImportLanguage> {
215        let extension = Path::new(file_path)
216            .extension()
217            .and_then(|ext| ext.to_str())
218            .unwrap_or("");
219        
220        ImportLanguage::from_extension(extension)
221    }
222    
223    /// Extract imports from file content using AST parsing
224    pub fn extract_imports(&self, content: &str, file_path: &str) -> Vec<String> {
225        // Detect language from file path
226        let language = match self.detect_language_from_path(file_path) {
227            Some(lang) => lang,
228            None => return Vec::new(),
229        };
230        
231        // Use AST parser to extract imports
232        match self.ast_parser.extract_imports(content, language) {
233            Ok(simple_imports) => {
234                simple_imports.into_iter()
235                    .map(|import| import.module)
236                    .collect()
237            }
238            Err(_) => {
239                // Fall back to empty list if parsing fails
240                Vec::new()
241            }
242        }
243    }
244}
245
246impl ImportGraph {
247    /// Create a new empty import graph
248    pub fn new() -> Self {
249        Self {
250            nodes: Vec::new(),
251            path_to_index: HashMap::new(),
252            dependencies: Vec::new(),
253            dependents: Vec::new(),
254            pagerank_scores: None,
255        }
256    }
257    
258    /// Add a node to the graph
259    pub fn add_node(&mut self, path: String) -> usize {
260        if let Some(&existing_idx) = self.path_to_index.get(&path) {
261            return existing_idx;
262        }
263        
264        let index = self.nodes.len();
265        self.nodes.push(path.clone());
266        self.path_to_index.insert(path, index);
267        self.dependencies.push(Vec::new());
268        self.dependents.push(Vec::new());
269        
270        // Invalidate cached PageRank scores
271        self.pagerank_scores = None;
272        
273        index
274    }
275    
276    /// Add an edge from source to target (source depends on target)
277    pub fn add_edge(&mut self, source: usize, target: usize) {
278        if source < self.dependencies.len() && target < self.dependents.len() && source != target {
279            // Avoid duplicate edges
280            if !self.dependencies[source].contains(&target) {
281                self.dependencies[source].push(target);
282                self.dependents[target].push(source);
283                
284                // Invalidate cached PageRank scores
285                self.pagerank_scores = None;
286            }
287        }
288    }
289    
290    /// Get node degrees (in-degree, out-degree)
291    pub fn get_node_degrees(&self, path: &str) -> Option<(usize, usize)> {
292        let index = *self.path_to_index.get(path)?;
293        let in_degree = self.dependents[index].len();
294        let out_degree = self.dependencies[index].len();
295        Some((in_degree, out_degree))
296    }
297    
298    /// Get PageRank scores for all nodes
299    pub fn get_pagerank_scores(&mut self) -> Result<&[f64]> {
300        if self.pagerank_scores.is_none() {
301            let calculator = CentralityCalculator::default();
302            let scores = calculator.calculate_pagerank(self)?;
303            self.pagerank_scores = Some(scores);
304        }
305        
306        Ok(self.pagerank_scores.as_ref().unwrap())
307    }
308    
309    /// Get PageRank score for a specific file
310    pub fn get_pagerank_score(&mut self, path: &str) -> Result<f64> {
311        let index = *self.path_to_index.get(path).unwrap_or(&0);
312        let scores = self.get_pagerank_scores()?;
313        Ok(*scores.get(index).unwrap_or(&0.0))
314    }
315    
316    /// Get graph statistics
317    pub fn stats(&self) -> GraphStats {
318        let node_count = self.nodes.len();
319        let edge_count: usize = self.dependencies.iter().map(|deps| deps.len()).sum();
320        
321        let in_degrees: Vec<usize> = self.dependents.iter().map(|deps| deps.len()).collect();
322        let out_degrees: Vec<usize> = self.dependencies.iter().map(|deps| deps.len()).collect();
323        
324        let avg_in_degree = if node_count > 0 {
325            in_degrees.iter().sum::<usize>() as f64 / node_count as f64
326        } else {
327            0.0
328        };
329        
330        let avg_out_degree = if node_count > 0 {
331            out_degrees.iter().sum::<usize>() as f64 / node_count as f64
332        } else {
333            0.0
334        };
335        
336        let max_in_degree = in_degrees.iter().max().cloned().unwrap_or(0);
337        let max_out_degree = out_degrees.iter().max().cloned().unwrap_or(0);
338        
339        let density = if node_count > 1 {
340            edge_count as f64 / (node_count * (node_count - 1)) as f64
341        } else {
342            0.0
343        };
344        
345        GraphStats {
346            node_count,
347            edge_count,
348            avg_in_degree,
349            avg_out_degree,
350            max_in_degree,
351            max_out_degree,
352            density,
353        }
354    }
355}
356
357impl Default for ImportGraph {
358    fn default() -> Self {
359        Self::new()
360    }
361}
362
363impl CentralityCalculator {
364    /// Create a new centrality calculator with default parameters
365    pub fn new() -> Self {
366        Self {
367            damping_factor: 0.85,
368            max_iterations: 100,
369            tolerance: 1e-6,
370        }
371    }
372    
373    /// Create with custom parameters
374    pub fn with_params(damping_factor: f64, max_iterations: usize, tolerance: f64) -> Self {
375        Self {
376            damping_factor,
377            max_iterations,
378            tolerance,
379        }
380    }
381    
382    /// Calculate PageRank scores for the graph
383    pub fn calculate_pagerank(&self, graph: &ImportGraph) -> Result<Vec<f64>> {
384        let n = graph.nodes.len();
385        if n == 0 {
386            return Ok(Vec::new());
387        }
388        
389        // Initialize scores uniformly
390        let mut scores = vec![1.0 / n as f64; n];
391        let mut new_scores = vec![0.0; n];
392        
393        for _iteration in 0..self.max_iterations {
394            // Calculate new scores
395            for (i, new_score) in new_scores.iter_mut().enumerate() {
396                let mut rank_sum = 0.0;
397                
398                // Sum contributions from nodes pointing to this one
399                for &source in &graph.dependents[i] {
400                    let out_degree = graph.dependencies[source].len();
401                    if out_degree > 0 {
402                        rank_sum += scores[source] / out_degree as f64;
403                    }
404                }
405                
406                // Apply PageRank formula
407                *new_score = (1.0 - self.damping_factor) / n as f64 + self.damping_factor * rank_sum;
408            }
409            
410            // Check for convergence
411            let mut max_diff = 0.0;
412            for i in 0..n {
413                let diff = (new_scores[i] - scores[i]).abs();
414                if diff > max_diff {
415                    max_diff = diff;
416                }
417            }
418            
419            // Swap score vectors
420            std::mem::swap(&mut scores, &mut new_scores);
421            
422            if max_diff < self.tolerance {
423                break;
424            }
425        }
426        
427        Ok(scores)
428    }
429}
430
431impl Default for CentralityCalculator {
432    fn default() -> Self {
433        Self::new()
434    }
435}
436
437/// Graph statistics for analysis
438#[derive(Debug, Clone)]
439pub struct GraphStats {
440    pub node_count: usize,
441    pub edge_count: usize,
442    pub avg_in_degree: f64,
443    pub avg_out_degree: f64,
444    pub max_in_degree: usize,
445    pub max_out_degree: usize,
446    pub density: f64,
447}
448
449/// Check if an import statement matches a file path
450pub fn import_matches_file(import_path: &str, file_path: &str) -> bool {
451    // Normalize both paths for comparison
452    let normalized_import = normalize_for_matching(import_path);
453    let normalized_file = normalize_for_matching(file_path);
454    
455    // Direct match
456    if normalized_import == normalized_file {
457        return true;
458    }
459    
460    // Check if import is a substring of file path (for module imports)
461    if normalized_file.contains(&normalized_import) {
462        return true;
463    }
464    
465    // Check if file path ends with import (common for relative imports)
466    if normalized_file.ends_with(&normalized_import) {
467        return true;
468    }
469    
470    // Check module-style imports (convert :: to /)
471    let module_import = normalized_import.replace("::", "/");
472    if normalized_file.contains(&module_import) || 
473       module_import.contains(&normalized_file) {
474        return true;
475    }
476    
477    // Special handling for std library and common patterns
478    if normalized_import.starts_with("std::") {
479        let std_path = normalized_import.replace("::", "/");
480        if normalized_file.replace("_", "").contains(&std_path.replace("_", "")) {
481            return true;
482        }
483    }
484    
485    // Handle index files
486    if is_index_file(file_path) {
487        let directory = std::path::Path::new(file_path)
488            .parent()
489            .and_then(|p| p.to_str())
490            .unwrap_or("");
491        let normalized_dir = normalize_for_matching(directory);
492        if normalized_import == normalized_dir {
493            return true;
494        }
495    }
496    
497    false
498}
499
500/// Normalize paths for matching comparison
501fn normalize_for_matching(path: &str) -> String {
502    let mut normalized = path.to_lowercase();
503    
504    // Remove common prefixes
505    if normalized.starts_with("./") {
506        normalized = normalized[2..].to_string();
507    }
508    
509    if normalized.starts_with("src/") {
510        normalized = normalized[4..].to_string();
511    }
512    
513    // Remove file extensions
514    if let Some(dot_pos) = normalized.rfind('.') {
515        normalized.truncate(dot_pos);
516    }
517    
518    // Normalize separators
519    normalized = normalized.replace('\\', "/");
520    
521    // Remove trailing slashes
522    normalized.trim_end_matches('/').to_string()
523}
524
525/// Check if a file is an index file (index.js, __init__.py, mod.rs, etc.)
526fn is_index_file(file_path: &str) -> bool {
527    let file_name = std::path::Path::new(file_path)
528        .file_stem()
529        .and_then(|stem| stem.to_str())
530        .unwrap_or("");
531    
532    matches!(file_name, "index" | "__init__" | "mod" | "main")
533}
534
535#[cfg(test)]
536mod tests {
537    use super::*;
538    use super::super::DocumentAnalysis;
539    
540    // Mock scan result for testing
541    #[derive(Debug)]
542    struct MockScanResult {
543        path: String,
544        relative_path: String,
545        depth: usize,
546        imports: Option<Vec<String>>,
547    }
548    
549    impl MockScanResult {
550        fn new(path: &str, imports: Vec<&str>) -> Self {
551            Self {
552                path: path.to_string(),
553                relative_path: path.to_string(),
554                depth: path.matches('/').count(),
555                imports: Some(imports.iter().map(|s| s.to_string()).collect()),
556            }
557        }
558    }
559    
560    impl ScanResult for MockScanResult {
561        fn path(&self) -> &str { &self.path }
562        fn relative_path(&self) -> &str { &self.relative_path }
563        fn depth(&self) -> usize { self.depth }
564        fn is_docs(&self) -> bool { false }
565        fn is_readme(&self) -> bool { false }
566        fn is_test(&self) -> bool { false }
567        fn is_entrypoint(&self) -> bool { false }
568        fn has_examples(&self) -> bool { false }
569        fn priority_boost(&self) -> f64 { 0.0 }
570        fn churn_score(&self) -> f64 { 0.0 }
571        fn centrality_in(&self) -> f64 { 0.0 }
572        fn imports(&self) -> Option<&[String]> { self.imports.as_deref() }
573        fn doc_analysis(&self) -> Option<&DocumentAnalysis> { None }
574    }
575    
576    #[test]
577    fn test_import_graph_creation() {
578        let mut graph = ImportGraph::new();
579        
580        let idx1 = graph.add_node("file1.rs".to_string());
581        let idx2 = graph.add_node("file2.rs".to_string());
582        
583        assert_eq!(idx1, 0);
584        assert_eq!(idx2, 1);
585        assert_eq!(graph.nodes.len(), 2);
586        
587        graph.add_edge(idx1, idx2);
588        assert_eq!(graph.dependencies[idx1].len(), 1);
589        assert_eq!(graph.dependents[idx2].len(), 1);
590    }
591    
592    #[test]
593    fn test_graph_builder() {
594        let files = vec![
595            MockScanResult::new("src/main.rs", vec!["src/lib.rs", "src/utils.rs"]),
596            MockScanResult::new("src/lib.rs", vec!["src/utils.rs"]),
597            MockScanResult::new("src/utils.rs", vec![]),
598        ];
599        
600        let mut builder = ImportGraphBuilder::new().unwrap();
601        let result = builder.build_graph(&files);
602        assert!(result.is_ok());
603        
604        let graph = result.unwrap();
605        assert_eq!(graph.nodes.len(), 3);
606        
607        let stats = graph.stats();
608        assert_eq!(stats.node_count, 3);
609        assert!(stats.edge_count > 0);
610    }
611    
612    #[test]
613    fn test_import_matching() {
614        // Direct matches
615        assert!(import_matches_file("src/utils", "src/utils.rs"));
616        assert!(import_matches_file("./lib", "src/lib.js"));
617        
618        // Module-style matches
619        assert!(import_matches_file("std::collections::HashMap", "std/collections/hash_map.rs"));
620        
621        // Index file matches
622        assert!(import_matches_file("src/components", "src/components/index.js"));
623        
624        // Non-matches
625        assert!(!import_matches_file("completely_different", "src/utils.rs"));
626    }
627    
628    #[test]
629    fn test_path_normalization() {
630        let mut builder = ImportGraphBuilder::new().unwrap();
631        
632        // Test various import formats
633        let normalized1 = builder.normalize_import_path("\"./utils.js\"");
634        assert_eq!(normalized1, "utils");
635        
636        let normalized2 = builder.normalize_import_path("../lib/helper.ts");
637        assert!(normalized2.contains("helper"));
638        
639        let normalized3 = builder.normalize_import_path("@/components/Button");
640        assert!(normalized3.contains("src/components/Button"));
641    }
642    
643    #[test]
644    fn test_pagerank_calculation() {
645        let mut graph = ImportGraph::new();
646        
647        // Create a simple graph: A -> B -> C, A -> C
648        let idx_a = graph.add_node("A".to_string());
649        let idx_b = graph.add_node("B".to_string());
650        let idx_c = graph.add_node("C".to_string());
651        
652        graph.add_edge(idx_a, idx_b);
653        graph.add_edge(idx_b, idx_c);
654        graph.add_edge(idx_a, idx_c);
655        
656        let scores = graph.get_pagerank_scores();
657        assert!(scores.is_ok());
658        
659        let scores = scores.unwrap();
660        assert_eq!(scores.len(), 3);
661        
662        // C should have the highest score (most depended upon)
663        assert!(scores[idx_c] > scores[idx_a]);
664        assert!(scores[idx_c] > scores[idx_b]);
665    }
666    
667    #[test]
668    fn test_import_extraction() {
669        let builder = ImportGraphBuilder::new().unwrap();
670        
671        // JavaScript content
672        let js_content = r#"
673            import { Component } from 'react';
674            import utils from './utils.js';
675            const fs = require('fs');
676        "#;
677        
678        let imports = builder.extract_imports(js_content, "test.js");
679        assert!(imports.len() >= 2);
680        assert!(imports.contains(&"react".to_string()));
681        assert!(imports.contains(&"./utils.js".to_string()));
682        
683        // Rust content
684        let rust_content = r#"
685            use std::collections::HashMap;
686            use crate::utils::helper;
687            mod tests;
688        "#;
689        
690        let imports = builder.extract_imports(rust_content, "test.rs");
691        assert!(imports.len() >= 2);
692        assert!(imports.contains(&"std::collections::HashMap".to_string()));
693        assert!(imports.contains(&"crate::utils::helper".to_string()));
694    }
695    
696    #[test]
697    fn test_centrality_calculator() {
698        let mut graph = ImportGraph::new();
699        
700        // Create a star graph with central node
701        let center = graph.add_node("center.rs".to_string());
702        for i in 1..=5 {
703            let node = graph.add_node(format!("node{}.rs", i));
704            graph.add_edge(node, center); // All nodes depend on center
705        }
706        
707        let calculator = CentralityCalculator::default();
708        let scores = calculator.calculate_pagerank(&graph);
709        assert!(scores.is_ok());
710        
711        let scores = scores.unwrap();
712        assert_eq!(scores.len(), 6);
713        
714        // Center node should have highest PageRank
715        assert!(scores[center] > scores[1]); // Higher than any leaf node
716    }
717    
718    #[test]
719    fn test_graph_statistics() {
720        let mut graph = ImportGraph::new();
721        
722        for i in 0..5 {
723            graph.add_node(format!("file{}.rs", i));
724        }
725        
726        // Add some edges
727        graph.add_edge(0, 1);
728        graph.add_edge(1, 2);
729        graph.add_edge(2, 3);
730        graph.add_edge(0, 4);
731        
732        let stats = graph.stats();
733        assert_eq!(stats.node_count, 5);
734        assert_eq!(stats.edge_count, 4);
735        assert!(stats.avg_out_degree > 0.0);
736        assert!(stats.density > 0.0);
737        assert!(stats.density < 1.0);
738    }
739}