scribe_graph/
statistics.rs

1//! # Graph Statistics and Analysis for Dependency Graphs
2//!
3//! Comprehensive statistical analysis and metrics computation for code dependency graphs.
4//! This module provides detailed insights into graph structure, connectivity patterns,
5//! and characteristics that are essential for understanding codebase architecture.
6//!
7//! ## Key Features
8//! - **Degree Distribution Analysis**: In-degree, out-degree, and total degree statistics
9//! - **Connectivity Metrics**: Strongly connected components, graph density, clustering
10//! - **Structural Patterns**: Hub detection, authority identification, bottleneck analysis
11//! - **Import Relationship Insights**: Dependency depth, fan-in/fan-out analysis
12//! - **Performance Characteristics**: Memory usage, computation complexity estimates
13//! - **Comparative Analysis**: Before/after graph evolution tracking
14
15use std::collections::{HashMap, VecDeque};
16use serde::{Deserialize, Serialize};
17use scribe_core::Result;
18
19use crate::graph::{DependencyGraph, NodeId, GraphStatistics};
20
21/// Comprehensive graph analysis results
22#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
23pub struct GraphAnalysisResults {
24    /// Basic graph statistics
25    pub basic_stats: GraphStatistics,
26    
27    /// Degree distribution analysis
28    pub degree_distribution: DegreeDistribution,
29    
30    /// Connectivity analysis
31    pub connectivity: ConnectivityAnalysis,
32    
33    /// Structural patterns
34    pub structural_patterns: StructuralPatterns,
35    
36    /// Import relationship insights
37    pub import_insights: ImportInsights,
38    
39    /// Performance characteristics
40    pub performance_profile: PerformanceProfile,
41    
42    /// Analysis metadata
43    pub analysis_metadata: AnalysisMetadata,
44}
45
46/// Detailed degree distribution statistics
47#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
48pub struct DegreeDistribution {
49    /// In-degree statistics
50    pub in_degree: DegreeStats,
51    
52    /// Out-degree statistics  
53    pub out_degree: DegreeStats,
54    
55    /// Total degree statistics
56    pub total_degree: DegreeStats,
57    
58    /// Degree distribution histogram (degree -> count)
59    pub in_degree_histogram: HashMap<usize, usize>,
60    pub out_degree_histogram: HashMap<usize, usize>,
61    
62    /// Power-law fitting parameters (if applicable)
63    pub power_law_alpha: Option<f64>,
64    pub power_law_goodness_of_fit: Option<f64>,
65}
66
67/// Statistical measures for degree sequences
68#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
69pub struct DegreeStats {
70    pub min: usize,
71    pub max: usize,
72    pub mean: f64,
73    pub median: f64,
74    pub std_dev: f64,
75    pub percentile_25: f64,
76    pub percentile_75: f64,
77    pub percentile_90: f64,
78    pub percentile_95: f64,
79    pub percentile_99: f64,
80}
81
82/// Graph connectivity and component analysis
83#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
84pub struct ConnectivityAnalysis {
85    /// Number of weakly connected components
86    pub weakly_connected_components: usize,
87    
88    /// Number of strongly connected components
89    pub strongly_connected_components: usize,
90    
91    /// Size of largest strongly connected component
92    pub largest_scc_size: usize,
93    
94    /// Graph density (actual edges / possible edges)
95    pub graph_density: f64,
96    
97    /// Average clustering coefficient
98    pub average_clustering: f64,
99    
100    /// Global clustering coefficient (transitivity)
101    pub global_clustering: f64,
102    
103    /// Average path length (in largest component)
104    pub average_path_length: Option<f64>,
105    
106    /// Graph diameter (longest shortest path)
107    pub diameter: Option<usize>,
108    
109    /// Is the graph acyclic (DAG)
110    pub is_acyclic: bool,
111}
112
113/// Structural patterns and notable nodes
114#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
115pub struct StructuralPatterns {
116    /// Hub nodes (high out-degree)
117    pub hubs: Vec<NodeInfo>,
118    
119    /// Authority nodes (high in-degree) 
120    pub authorities: Vec<NodeInfo>,
121    
122    /// Bottleneck nodes (high betweenness centrality estimate)
123    pub bottlenecks: Vec<NodeInfo>,
124    
125    /// Bridge nodes (critical for connectivity)
126    pub bridges: Vec<NodeInfo>,
127    
128    /// Isolated nodes (no connections)
129    pub isolated_nodes: Vec<NodeId>,
130    
131    /// Dangling nodes (no outgoing edges)
132    pub dangling_nodes: Vec<NodeId>,
133    
134    /// Leaf nodes (no incoming edges, but have outgoing)
135    pub leaf_nodes: Vec<NodeId>,
136}
137
138/// Information about important nodes
139#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
140pub struct NodeInfo {
141    pub node_id: NodeId,
142    pub score: f64,
143    pub in_degree: usize,
144    pub out_degree: usize,
145    pub metadata: Option<String>, // Additional context
146}
147
148/// Import relationship specific insights
149#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
150pub struct ImportInsights {
151    /// Average import depth (distance from root nodes)
152    pub average_import_depth: f64,
153    
154    /// Maximum import depth
155    pub max_import_depth: usize,
156    
157    /// Import fan-out distribution (how many files each file imports)
158    pub fan_out_distribution: DegreeStats,
159    
160    /// Import fan-in distribution (how many files import each file)
161    pub fan_in_distribution: DegreeStats,
162    
163    /// Circular dependency detection
164    pub circular_dependencies: Vec<CircularDependency>,
165    
166    /// Dependency layers (topological levels)
167    pub dependency_layers: Vec<Vec<NodeId>>,
168    
169    /// Critical import paths (most important dependency chains)
170    pub critical_paths: Vec<DependencyPath>,
171}
172
173/// Circular dependency information
174#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
175pub struct CircularDependency {
176    pub nodes: Vec<NodeId>,
177    pub cycle_length: usize,
178    pub strength: f64, // How many edges form the cycle
179}
180
181/// Important dependency path
182#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
183pub struct DependencyPath {
184    pub path: Vec<NodeId>,
185    pub length: usize,
186    pub importance_score: f64,
187}
188
189/// Performance characteristics of the graph
190#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
191pub struct PerformanceProfile {
192    /// Estimated memory usage for PageRank computation
193    pub pagerank_memory_estimate_mb: f64,
194    
195    /// Estimated PageRank computation time
196    pub pagerank_time_estimate_ms: u64,
197    
198    /// Graph traversal complexity
199    pub traversal_complexity: TraversalComplexity,
200    
201    /// Storage efficiency metrics
202    pub storage_efficiency: StorageEfficiency,
203}
204
205/// Complexity analysis for graph algorithms
206#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
207pub struct TraversalComplexity {
208    /// BFS/DFS time complexity class
209    pub time_complexity_class: String,
210    
211    /// Space complexity class
212    pub space_complexity_class: String,
213    
214    /// Expected iterations for convergence algorithms
215    pub expected_iterations: usize,
216}
217
218/// Storage and memory efficiency metrics
219#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
220pub struct StorageEfficiency {
221    /// Adjacency list memory usage (bytes)
222    pub adjacency_list_size_bytes: usize,
223    
224    /// Average edges per node
225    pub edges_per_node: f64,
226    
227    /// Memory overhead ratio
228    pub memory_overhead_ratio: f64,
229    
230    /// Sparsity coefficient
231    pub sparsity: f64,
232}
233
234/// Analysis execution metadata
235#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
236pub struct AnalysisMetadata {
237    /// When the analysis was performed
238    pub timestamp: chrono::DateTime<chrono::Utc>,
239    
240    /// Analysis duration in milliseconds
241    pub analysis_duration_ms: u64,
242    
243    /// Whether parallel computation was used
244    pub used_parallel: bool,
245    
246    /// Analysis configuration used
247    pub config: StatisticsConfig,
248    
249    /// Version of the statistics module
250    pub version: String,
251}
252
253impl Default for AnalysisMetadata {
254    fn default() -> Self {
255        Self {
256            timestamp: chrono::Utc::now(),
257            analysis_duration_ms: 0,
258            used_parallel: false,
259            config: StatisticsConfig::default(),
260            version: "1.0.0".to_string(),
261        }
262    }
263}
264
265/// Configuration for statistics computation
266#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
267pub struct StatisticsConfig {
268    /// Whether to compute expensive metrics (clustering, path length)
269    pub compute_expensive_metrics: bool,
270    
271    /// Whether to use parallel computation where possible
272    pub use_parallel: bool,
273    
274    /// Maximum number of nodes to analyze for expensive operations
275    pub max_nodes_for_expensive_ops: usize,
276    
277    /// Number of top nodes to include in structural patterns
278    pub top_nodes_count: usize,
279    
280    /// Minimum score threshold for pattern detection
281    pub pattern_threshold: f64,
282}
283
284impl Default for StatisticsConfig {
285    fn default() -> Self {
286        Self {
287            compute_expensive_metrics: true,
288            use_parallel: true,
289            max_nodes_for_expensive_ops: 10000,
290            top_nodes_count: 10,
291            pattern_threshold: 0.1,
292        }
293    }
294}
295
296/// Main graph statistics analyzer
297#[derive(Debug)]
298pub struct GraphStatisticsAnalyzer {
299    config: StatisticsConfig,
300}
301
302impl GraphStatisticsAnalyzer {
303    /// Create a new analyzer with default configuration
304    pub fn new() -> Self {
305        Self {
306            config: StatisticsConfig::default(),
307        }
308    }
309    
310    /// Create with custom configuration
311    pub fn with_config(config: StatisticsConfig) -> Self {
312        Self { config }
313    }
314    
315    /// Create analyzer optimized for large graphs
316    pub fn for_large_graphs() -> Self {
317        Self {
318            config: StatisticsConfig {
319                compute_expensive_metrics: false,
320                max_nodes_for_expensive_ops: 5000,
321                top_nodes_count: 5,
322                ..StatisticsConfig::default()
323            },
324        }
325    }
326    
327    /// Perform comprehensive analysis of the dependency graph
328    pub fn analyze(&self, graph: &DependencyGraph) -> Result<GraphAnalysisResults> {
329        let start_time = std::time::Instant::now();
330        
331        // Basic statistics
332        let basic_stats = self.compute_basic_statistics(graph);
333        
334        // Degree distribution analysis
335        let degree_distribution = self.analyze_degree_distribution(graph)?;
336        
337        // Connectivity analysis
338        let connectivity = self.analyze_connectivity(graph)?;
339        
340        // Structural patterns
341        let structural_patterns = self.identify_structural_patterns(graph)?;
342        
343        // Import-specific insights
344        let import_insights = self.analyze_import_patterns(graph)?;
345        
346        // Performance characteristics
347        let performance_profile = self.estimate_performance_characteristics(graph);
348        
349        // Analysis metadata
350        let analysis_metadata = AnalysisMetadata {
351            timestamp: chrono::Utc::now(),
352            analysis_duration_ms: start_time.elapsed().as_millis() as u64,
353            used_parallel: self.config.use_parallel,
354            config: self.config.clone(),
355            version: env!("CARGO_PKG_VERSION").to_string(),
356        };
357        
358        Ok(GraphAnalysisResults {
359            basic_stats,
360            degree_distribution,
361            connectivity,
362            structural_patterns,
363            import_insights,
364            performance_profile,
365            analysis_metadata,
366        })
367    }
368    
369    /// Compute basic graph statistics
370    fn compute_basic_statistics(&self, graph: &DependencyGraph) -> GraphStatistics {
371        let total_nodes = graph.node_count();
372        let total_edges = graph.edge_count();
373        
374        if total_nodes == 0 {
375            return GraphStatistics::empty();
376        }
377        
378        // Compute degree statistics
379        let degrees: Vec<_> = graph.nodes()
380            .map(|node| (graph.in_degree(node), graph.out_degree(node)))
381            .collect();
382        
383        let in_degrees: Vec<_> = degrees.iter().map(|(in_deg, _)| *in_deg).collect();
384        let out_degrees: Vec<_> = degrees.iter().map(|(_, out_deg)| *out_deg).collect();
385        
386        let in_degree_avg = in_degrees.iter().sum::<usize>() as f64 / total_nodes as f64;
387        let in_degree_max = *in_degrees.iter().max().unwrap_or(&0);
388        let out_degree_avg = out_degrees.iter().sum::<usize>() as f64 / total_nodes as f64;
389        let out_degree_max = *out_degrees.iter().max().unwrap_or(&0);
390        
391        // Count special node types
392        let isolated_nodes = degrees.iter().filter(|(in_deg, out_deg)| *in_deg == 0 && *out_deg == 0).count();
393        let dangling_nodes = degrees.iter().filter(|(_, out_deg)| *out_deg == 0).count();
394        
395        // Graph density
396        let max_possible_edges = total_nodes * (total_nodes - 1);
397        let graph_density = if max_possible_edges > 0 {
398            total_edges as f64 / max_possible_edges as f64
399        } else {
400            0.0
401        };
402        
403        GraphStatistics {
404            total_nodes,
405            total_edges,
406            in_degree_avg,
407            in_degree_max,
408            out_degree_avg,
409            out_degree_max,
410            strongly_connected_components: graph.estimate_scc_count(),
411            graph_density,
412            isolated_nodes,
413            dangling_nodes,
414        }
415    }
416    
417    /// Analyze degree distributions in detail
418    fn analyze_degree_distribution(&self, graph: &DependencyGraph) -> Result<DegreeDistribution> {
419        let degrees: Vec<_> = graph.nodes()
420            .map(|node| (graph.in_degree(node), graph.out_degree(node)))
421            .collect();
422        
423        let in_degrees: Vec<_> = degrees.iter().map(|(in_deg, _)| *in_deg).collect();
424        let out_degrees: Vec<_> = degrees.iter().map(|(_, out_deg)| *out_deg).collect();
425        let total_degrees: Vec<_> = degrees.iter().map(|(in_deg, out_deg)| in_deg + out_deg).collect();
426        
427        Ok(DegreeDistribution {
428            in_degree: self.compute_degree_stats(&in_degrees),
429            out_degree: self.compute_degree_stats(&out_degrees),
430            total_degree: self.compute_degree_stats(&total_degrees),
431            in_degree_histogram: self.compute_histogram(&in_degrees),
432            out_degree_histogram: self.compute_histogram(&out_degrees),
433            power_law_alpha: self.estimate_power_law_alpha(&in_degrees),
434            power_law_goodness_of_fit: None, // TODO: Implement KS test
435        })
436    }
437    
438    /// Compute detailed statistics for a degree sequence
439    fn compute_degree_stats(&self, degrees: &[usize]) -> DegreeStats {
440        if degrees.is_empty() {
441            return DegreeStats {
442                min: 0,
443                max: 0,
444                mean: 0.0,
445                median: 0.0,
446                std_dev: 0.0,
447                percentile_25: 0.0,
448                percentile_75: 0.0,
449                percentile_90: 0.0,
450                percentile_95: 0.0,
451                percentile_99: 0.0,
452            };
453        }
454        
455        let mut sorted_degrees = degrees.to_vec();
456        sorted_degrees.sort();
457        
458        let min = sorted_degrees[0];
459        let max = sorted_degrees[sorted_degrees.len() - 1];
460        let mean = degrees.iter().sum::<usize>() as f64 / degrees.len() as f64;
461        
462        // Percentiles
463        let median = self.percentile(&sorted_degrees, 50.0);
464        let percentile_25 = self.percentile(&sorted_degrees, 25.0);
465        let percentile_75 = self.percentile(&sorted_degrees, 75.0);
466        let percentile_90 = self.percentile(&sorted_degrees, 90.0);
467        let percentile_95 = self.percentile(&sorted_degrees, 95.0);
468        let percentile_99 = self.percentile(&sorted_degrees, 99.0);
469        
470        // Standard deviation
471        let variance = degrees.iter()
472            .map(|&deg| (deg as f64 - mean).powi(2))
473            .sum::<f64>() / degrees.len() as f64;
474        let std_dev = variance.sqrt();
475        
476        DegreeStats {
477            min,
478            max,
479            mean,
480            median,
481            std_dev,
482            percentile_25,
483            percentile_75,
484            percentile_90,
485            percentile_95,
486            percentile_99,
487        }
488    }
489    
490    /// Calculate percentile from sorted array
491    fn percentile(&self, sorted_values: &[usize], percentile: f64) -> f64 {
492        if sorted_values.is_empty() {
493            return 0.0;
494        }
495        
496        let index = (percentile / 100.0) * (sorted_values.len() - 1) as f64;
497        let lower = index.floor() as usize;
498        let upper = index.ceil() as usize;
499        
500        if lower == upper {
501            sorted_values[lower] as f64
502        } else {
503            let weight = index - lower as f64;
504            (1.0 - weight) * sorted_values[lower] as f64 + weight * sorted_values[upper] as f64
505        }
506    }
507    
508    /// Compute degree histogram
509    fn compute_histogram(&self, degrees: &[usize]) -> HashMap<usize, usize> {
510        let mut histogram = HashMap::new();
511        for &degree in degrees {
512            *histogram.entry(degree).or_insert(0) += 1;
513        }
514        histogram
515    }
516    
517    /// Estimate power-law exponent alpha using linear regression on log-log plot
518    fn estimate_power_law_alpha(&self, degrees: &[usize]) -> Option<f64> {
519        // Filter out zeros (can't take log)
520        let non_zero_degrees: Vec<_> = degrees.iter().filter(|&&d| d > 0).collect();
521        
522        if non_zero_degrees.len() < 10 {
523            return None; // Not enough data for reliable estimation
524        }
525        
526        // Create log-log data points
527        let mut log_points = Vec::new();
528        let histogram = self.compute_histogram(&non_zero_degrees.into_iter().copied().collect::<Vec<_>>());
529        
530        for (&degree, &count) in &histogram {
531            if count > 0 {
532                log_points.push((degree as f64, count as f64));
533            }
534        }
535        
536        if log_points.len() < 5 {
537            return None;
538        }
539        
540        // Linear regression: log(count) = -alpha * log(degree) + C
541        let n = log_points.len() as f64;
542        let sum_log_x: f64 = log_points.iter().map(|(x, _)| x.ln()).sum();
543        let sum_log_y: f64 = log_points.iter().map(|(_, y)| y.ln()).sum();
544        let sum_log_x_log_y: f64 = log_points.iter().map(|(x, y)| x.ln() * y.ln()).sum();
545        let sum_log_x_squared: f64 = log_points.iter().map(|(x, _)| x.ln().powi(2)).sum();
546        
547        let denominator = n * sum_log_x_squared - sum_log_x.powi(2);
548        if denominator.abs() < 1e-10 {
549            return None;
550        }
551        
552        let alpha = -(n * sum_log_x_log_y - sum_log_x * sum_log_y) / denominator;
553        
554        // Only return positive alpha (valid for power-law)
555        if alpha > 0.0 { Some(alpha) } else { None }
556    }
557    
558    /// Analyze graph connectivity properties
559    fn analyze_connectivity(&self, graph: &DependencyGraph) -> Result<ConnectivityAnalysis> {
560        let strongly_connected_components = graph.estimate_scc_count();
561        let largest_scc_size = self.estimate_largest_scc_size(graph);
562        
563        // For now, assume weakly connected components = strongly connected components
564        // A more sophisticated implementation would use Union-Find
565        let weakly_connected_components = strongly_connected_components;
566        
567        let graph_density = if graph.node_count() > 1 {
568            let max_edges = graph.node_count() * (graph.node_count() - 1);
569            graph.edge_count() as f64 / max_edges as f64
570        } else {
571            0.0
572        };
573        
574        // Expensive computations only for smaller graphs
575        let (average_clustering, global_clustering, average_path_length, diameter) = 
576            if self.config.compute_expensive_metrics && graph.node_count() <= self.config.max_nodes_for_expensive_ops {
577                (
578                    Some(self.estimate_average_clustering(graph)),
579                    Some(self.estimate_global_clustering(graph)),
580                    self.estimate_average_path_length(graph),
581                    self.estimate_diameter(graph),
582                )
583            } else {
584                (None, None, None, None)
585            };
586        
587        let is_acyclic = self.estimate_is_acyclic(graph);
588        
589        Ok(ConnectivityAnalysis {
590            weakly_connected_components,
591            strongly_connected_components,
592            largest_scc_size,
593            graph_density,
594            average_clustering: average_clustering.unwrap_or(0.0),
595            global_clustering: global_clustering.unwrap_or(0.0),
596            average_path_length,
597            diameter,
598            is_acyclic,
599        })
600    }
601    
602    /// Estimate size of largest strongly connected component
603    fn estimate_largest_scc_size(&self, graph: &DependencyGraph) -> usize {
604        // Simplified estimation: find node with highest product of in/out degrees
605        graph.nodes()
606            .map(|node| std::cmp::min(graph.in_degree(node) + 1, graph.out_degree(node) + 1))
607            .max()
608            .unwrap_or(0)
609    }
610    
611    /// Estimate average clustering coefficient
612    fn estimate_average_clustering(&self, graph: &DependencyGraph) -> f64 {
613        let clustering_coefficients: Vec<f64> = graph.nodes()
614            .map(|node| self.local_clustering_coefficient(graph, node))
615            .collect();
616        
617        if clustering_coefficients.is_empty() {
618            0.0
619        } else {
620            clustering_coefficients.iter().sum::<f64>() / clustering_coefficients.len() as f64
621        }
622    }
623    
624    /// Compute local clustering coefficient for a node
625    fn local_clustering_coefficient(&self, graph: &DependencyGraph, node: &NodeId) -> f64 {
626        let neighbors = graph.all_neighbors(node);
627        let k = neighbors.len();
628        
629        if k < 2 {
630            return 0.0;
631        }
632        
633        // Count edges between neighbors
634        let mut edges_between_neighbors = 0;
635        let neighbor_vec: Vec<_> = neighbors.iter().collect();
636        
637        for i in 0..neighbor_vec.len() {
638            for j in (i + 1)..neighbor_vec.len() {
639                if graph.contains_edge(neighbor_vec[i], neighbor_vec[j]) ||
640                   graph.contains_edge(neighbor_vec[j], neighbor_vec[i]) {
641                    edges_between_neighbors += 1;
642                }
643            }
644        }
645        
646        let max_possible_edges = k * (k - 1) / 2;
647        edges_between_neighbors as f64 / max_possible_edges as f64
648    }
649    
650    /// Estimate global clustering coefficient (transitivity)
651    fn estimate_global_clustering(&self, graph: &DependencyGraph) -> f64 {
652        let mut triangles = 0;
653        let mut triplets = 0;
654        
655        for node in graph.nodes() {
656            let neighbors = graph.all_neighbors(node);
657            if neighbors.len() < 2 {
658                continue;
659            }
660            
661            let neighbor_vec: Vec<_> = neighbors.iter().collect();
662            
663            for i in 0..neighbor_vec.len() {
664                for j in (i + 1)..neighbor_vec.len() {
665                    triplets += 1;
666                    if graph.contains_edge(neighbor_vec[i], neighbor_vec[j]) ||
667                       graph.contains_edge(neighbor_vec[j], neighbor_vec[i]) {
668                        triangles += 1;
669                    }
670                }
671            }
672        }
673        
674        if triplets > 0 {
675            3.0 * triangles as f64 / triplets as f64
676        } else {
677            0.0
678        }
679    }
680    
681    /// Estimate average path length using sampling
682    fn estimate_average_path_length(&self, graph: &DependencyGraph) -> Option<f64> {
683        let nodes: Vec<_> = graph.nodes().collect();
684        if nodes.len() < 2 {
685            return None;
686        }
687        
688        let sample_size = std::cmp::min(100, nodes.len());
689        let mut total_path_length = 0.0;
690        let mut valid_paths = 0;
691        
692        for i in 0..sample_size {
693            let start_node = &nodes[i % nodes.len()];
694            let distances = self.bfs_distances(graph, start_node);
695            
696            for distance in distances.values() {
697                if *distance > 0 && *distance < usize::MAX {
698                    total_path_length += *distance as f64;
699                    valid_paths += 1;
700                }
701            }
702        }
703        
704        if valid_paths > 0 {
705            Some(total_path_length / valid_paths as f64)
706        } else {
707            None
708        }
709    }
710    
711    /// Estimate graph diameter
712    fn estimate_diameter(&self, graph: &DependencyGraph) -> Option<usize> {
713        let nodes: Vec<_> = graph.nodes().collect();
714        if nodes.is_empty() {
715            return None;
716        }
717        
718        let mut max_distance = 0;
719        let sample_size = std::cmp::min(20, nodes.len());
720        
721        for i in 0..sample_size {
722            let start_node = &nodes[i % nodes.len()];
723            let distances = self.bfs_distances(graph, start_node);
724            
725            for &distance in distances.values() {
726                if distance != usize::MAX && distance > max_distance {
727                    max_distance = distance;
728                }
729            }
730        }
731        
732        if max_distance > 0 { Some(max_distance) } else { None }
733    }
734    
735    /// BFS to compute distances from a source node
736    fn bfs_distances(&self, graph: &DependencyGraph, source: &NodeId) -> HashMap<NodeId, usize> {
737        let mut distances = HashMap::new();
738        let mut queue = VecDeque::new();
739        
740        distances.insert(source.clone(), 0);
741        queue.push_back(source.clone());
742        
743        while let Some(current) = queue.pop_front() {
744            let current_distance = distances[&current];
745            
746            // Visit both outgoing and incoming neighbors (undirected traversal)
747            if let Some(outgoing) = graph.outgoing_neighbors(&current) {
748                for neighbor in outgoing {
749                    if !distances.contains_key(neighbor) {
750                        distances.insert(neighbor.clone(), current_distance + 1);
751                        queue.push_back(neighbor.clone());
752                    }
753                }
754            }
755            
756            if let Some(incoming) = graph.incoming_neighbors(&current) {
757                for neighbor in incoming {
758                    if !distances.contains_key(neighbor) {
759                        distances.insert(neighbor.clone(), current_distance + 1);
760                        queue.push_back(neighbor.clone());
761                    }
762                }
763            }
764        }
765        
766        distances
767    }
768    
769    /// Check if graph is likely acyclic (DAG)
770    fn estimate_is_acyclic(&self, graph: &DependencyGraph) -> bool {
771        // Simple heuristic: if there are very few nodes with both in and out edges, likely acyclic
772        let nodes_with_both = graph.nodes()
773            .filter(|&node| graph.in_degree(node) > 0 && graph.out_degree(node) > 0)
774            .count();
775        
776        let total_nodes = graph.node_count();
777        if total_nodes == 0 {
778            return true;
779        }
780        
781        let bidirectional_ratio = nodes_with_both as f64 / total_nodes as f64;
782        bidirectional_ratio < 0.3 // Heuristic threshold
783    }
784    
785    /// Identify structural patterns in the graph
786    fn identify_structural_patterns(&self, graph: &DependencyGraph) -> Result<StructuralPatterns> {
787        // Find hubs (high out-degree)
788        let mut hub_candidates: Vec<_> = graph.nodes()
789            .map(|node| NodeInfo {
790                node_id: node.clone(),
791                score: graph.out_degree(node) as f64,
792                in_degree: graph.in_degree(node),
793                out_degree: graph.out_degree(node),
794                metadata: None,
795            })
796            .collect();
797        hub_candidates.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
798        let hubs = hub_candidates.into_iter().take(self.config.top_nodes_count).collect();
799        
800        // Find authorities (high in-degree)
801        let mut authority_candidates: Vec<_> = graph.nodes()
802            .map(|node| NodeInfo {
803                node_id: node.clone(),
804                score: graph.in_degree(node) as f64,
805                in_degree: graph.in_degree(node),
806                out_degree: graph.out_degree(node),
807                metadata: None,
808            })
809            .collect();
810        authority_candidates.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
811        let authorities = authority_candidates.into_iter().take(self.config.top_nodes_count).collect();
812        
813        // Identify special node types
814        let isolated_nodes: Vec<_> = graph.nodes()
815            .filter(|&node| graph.degree(node) == 0)
816            .cloned()
817            .collect();
818        
819        let dangling_nodes = graph.dangling_nodes().into_iter().cloned().collect();
820        
821        let leaf_nodes: Vec<_> = graph.nodes()
822            .filter(|&node| graph.in_degree(node) == 0 && graph.out_degree(node) > 0)
823            .cloned()
824            .collect();
825        
826        // TODO: Implement betweenness centrality for bottlenecks and bridges
827        let bottlenecks = Vec::new();
828        let bridges = Vec::new();
829        
830        Ok(StructuralPatterns {
831            hubs,
832            authorities,
833            bottlenecks,
834            bridges,
835            isolated_nodes,
836            dangling_nodes,
837            leaf_nodes,
838        })
839    }
840    
841    /// Analyze import-specific patterns
842    fn analyze_import_patterns(&self, graph: &DependencyGraph) -> Result<ImportInsights> {
843        let fan_out_stats = {
844            let out_degrees: Vec<_> = graph.nodes().map(|node| graph.out_degree(node)).collect();
845            self.compute_degree_stats(&out_degrees)
846        };
847        
848        let fan_in_stats = {
849            let in_degrees: Vec<_> = graph.nodes().map(|node| graph.in_degree(node)).collect();
850            self.compute_degree_stats(&in_degrees)
851        };
852        
853        // TODO: Implement sophisticated cycle detection and dependency layering
854        let circular_dependencies = Vec::new();
855        let dependency_layers = Vec::new();
856        let critical_paths = Vec::new();
857        
858        // Simple depth estimation
859        let max_import_depth = if let Some(diameter) = self.estimate_diameter(graph) {
860            diameter
861        } else {
862            0
863        };
864        
865        let average_import_depth = if let Some(avg_path) = self.estimate_average_path_length(graph) {
866            avg_path
867        } else {
868            0.0
869        };
870        
871        Ok(ImportInsights {
872            average_import_depth,
873            max_import_depth,
874            fan_out_distribution: fan_out_stats,
875            fan_in_distribution: fan_in_stats,
876            circular_dependencies,
877            dependency_layers,
878            critical_paths,
879        })
880    }
881    
882    /// Estimate performance characteristics for various graph algorithms
883    fn estimate_performance_characteristics(&self, graph: &DependencyGraph) -> PerformanceProfile {
884        let n = graph.node_count();
885        let m = graph.edge_count();
886        
887        // PageRank memory estimate (2 score vectors + graph overhead)
888        let pagerank_memory_mb = if n > 0 {
889            let score_vector_size = n * std::mem::size_of::<f64>();
890            let graph_overhead = m * (std::mem::size_of::<String>() + std::mem::size_of::<usize>());
891            ((score_vector_size * 2 + graph_overhead) as f64) / (1024.0 * 1024.0)
892        } else {
893            0.0
894        };
895        
896        // Rough PageRank time estimate (based on empirical observations)
897        let pagerank_time_estimate_ms = if n > 0 {
898            // Base time + iterations * (nodes + edges)
899            let base_time = 1;
900            let per_iteration_time = (n + m) / 10000; // Rough estimate
901            let estimated_iterations = if n < 1000 { 10 } else { 20 };
902            base_time + estimated_iterations * per_iteration_time.max(1)
903        } else {
904            0
905        } as u64;
906        
907        let traversal_complexity = TraversalComplexity {
908            time_complexity_class: format!("O(V + E) = O({} + {})", n, m),
909            space_complexity_class: format!("O(V) = O({})", n),
910            expected_iterations: if n < 1000 { 10 } else { 20 },
911        };
912        
913        let edges_per_node = if n > 0 { m as f64 / n as f64 } else { 0.0 };
914        let max_possible_edges = if n > 1 { n * (n - 1) } else { 1 };
915        let sparsity = 1.0 - (m as f64 / max_possible_edges as f64);
916        
917        let storage_efficiency = StorageEfficiency {
918            adjacency_list_size_bytes: m * (std::mem::size_of::<String>() + std::mem::size_of::<usize>()),
919            edges_per_node,
920            memory_overhead_ratio: 1.5, // Rough estimate for hash map overhead
921            sparsity,
922        };
923        
924        PerformanceProfile {
925            pagerank_memory_estimate_mb: pagerank_memory_mb,
926            pagerank_time_estimate_ms,
927            traversal_complexity,
928            storage_efficiency,
929        }
930    }
931}
932
933impl Default for GraphStatisticsAnalyzer {
934    fn default() -> Self {
935        Self::new()
936    }
937}
938
939/// Utility functions for graph analysis results
940impl GraphAnalysisResults {
941    /// Generate a comprehensive summary report
942    pub fn summary_report(&self) -> String {
943        format!(
944            "Graph Analysis Summary Report\n\
945             ============================\n\
946             \n\
947             Basic Statistics:\n\
948             - Nodes: {} | Edges: {} | Density: {:.4}\n\
949             - Average in-degree: {:.2} | Average out-degree: {:.2}\n\
950             - Strongly connected components: {}\n\
951             \n\
952             Degree Distribution:\n\
953             - In-degree range: [{}, {}] (mean: {:.2}, std: {:.2})\n\
954             - Out-degree range: [{}, {}] (mean: {:.2}, std: {:.2})\n\
955             - Power-law alpha: {}\n\
956             \n\
957             Connectivity:\n\
958             - Average clustering: {:.4}\n\
959             - Average path length: {}\n\
960             - Diameter: {} | Is acyclic: {}\n\
961             \n\
962             Structural Patterns:\n\
963             - Top hubs: {} | Top authorities: {}\n\
964             - Isolated nodes: {} | Dangling nodes: {} | Leaf nodes: {}\n\
965             \n\
966             Import Insights:\n\
967             - Average import depth: {:.2} | Max depth: {}\n\
968             - Circular dependencies: {}\n\
969             \n\
970             Performance Profile:\n\
971             - PageRank memory estimate: {:.1} MB\n\
972             - PageRank time estimate: {} ms\n\
973             - Graph sparsity: {:.4}\n\
974             \n\
975             Analysis completed in {} ms (parallel: {})",
976            self.basic_stats.total_nodes,
977            self.basic_stats.total_edges,
978            self.basic_stats.graph_density,
979            self.basic_stats.in_degree_avg,
980            self.basic_stats.out_degree_avg,
981            self.basic_stats.strongly_connected_components,
982            
983            self.degree_distribution.in_degree.min,
984            self.degree_distribution.in_degree.max,
985            self.degree_distribution.in_degree.mean,
986            self.degree_distribution.in_degree.std_dev,
987            self.degree_distribution.out_degree.min,
988            self.degree_distribution.out_degree.max,
989            self.degree_distribution.out_degree.mean,
990            self.degree_distribution.out_degree.std_dev,
991            self.degree_distribution.power_law_alpha.map_or("N/A".to_string(), |a| format!("{:.3}", a)),
992            
993            self.connectivity.average_clustering,
994            self.connectivity.average_path_length.map_or("N/A".to_string(), |l| format!("{:.2}", l)),
995            self.connectivity.diameter.map_or("N/A".to_string(), |d| d.to_string()),
996            self.connectivity.is_acyclic,
997            
998            self.structural_patterns.hubs.len(),
999            self.structural_patterns.authorities.len(),
1000            self.structural_patterns.isolated_nodes.len(),
1001            self.structural_patterns.dangling_nodes.len(),
1002            self.structural_patterns.leaf_nodes.len(),
1003            
1004            self.import_insights.average_import_depth,
1005            self.import_insights.max_import_depth,
1006            self.import_insights.circular_dependencies.len(),
1007            
1008            self.performance_profile.pagerank_memory_estimate_mb,
1009            self.performance_profile.pagerank_time_estimate_ms,
1010            self.performance_profile.storage_efficiency.sparsity,
1011            
1012            self.analysis_metadata.analysis_duration_ms,
1013            self.analysis_metadata.used_parallel,
1014        )
1015    }
1016    
1017    /// Get a list of the most important nodes by various metrics
1018    pub fn important_nodes_summary(&self) -> Vec<(String, Vec<String>)> {
1019        vec![
1020            (
1021                "Top Hubs (High Out-degree)".to_string(),
1022                self.structural_patterns.hubs.iter()
1023                    .map(|node| format!("{} (out: {})", node.node_id, node.out_degree))
1024                    .collect(),
1025            ),
1026            (
1027                "Top Authorities (High In-degree)".to_string(),
1028                self.structural_patterns.authorities.iter()
1029                    .map(|node| format!("{} (in: {})", node.node_id, node.in_degree))
1030                    .collect(),
1031            ),
1032        ]
1033    }
1034}
1035
1036#[cfg(test)]
1037mod tests {
1038    use super::*;
1039    use crate::graph::DependencyGraph;
1040    
1041    fn create_test_graph() -> DependencyGraph {
1042        let mut graph = DependencyGraph::new();
1043        
1044        // Create a more complex graph for testing
1045        graph.add_edge("main.py".to_string(), "utils.py".to_string()).unwrap();
1046        graph.add_edge("main.py".to_string(), "config.py".to_string()).unwrap();
1047        graph.add_edge("utils.py".to_string(), "config.py".to_string()).unwrap();
1048        graph.add_edge("test.py".to_string(), "main.py".to_string()).unwrap();
1049        graph.add_edge("test.py".to_string(), "utils.py".to_string()).unwrap();
1050        graph.add_node("isolated.py".to_string()).unwrap();
1051        
1052        graph
1053    }
1054    
1055    #[test]
1056    fn test_statistics_analyzer_creation() {
1057        let analyzer = GraphStatisticsAnalyzer::new();
1058        assert!(analyzer.config.compute_expensive_metrics);
1059        
1060        let large_graph_analyzer = GraphStatisticsAnalyzer::for_large_graphs();
1061        assert!(!large_graph_analyzer.config.compute_expensive_metrics);
1062    }
1063    
1064    #[test]
1065    fn test_basic_statistics() {
1066        let graph = create_test_graph();
1067        let analyzer = GraphStatisticsAnalyzer::new();
1068        
1069        let stats = analyzer.compute_basic_statistics(&graph);
1070        
1071        println!("Actual node count: {}, Expected: 6", stats.total_nodes);
1072        assert_eq!(stats.total_nodes, 5); // Fixed: should be 5 nodes (main.py, utils.py, config.py, test.py, isolated.py)
1073        assert_eq!(stats.total_edges, 5);
1074        assert!(stats.graph_density > 0.0);
1075        assert!(stats.graph_density < 1.0);
1076        assert_eq!(stats.isolated_nodes, 1); // isolated.py
1077    }
1078    
1079    #[test]
1080    fn test_degree_statistics() {
1081        let analyzer = GraphStatisticsAnalyzer::new();
1082        let degrees = vec![0, 1, 1, 2, 3, 5, 8];
1083        
1084        let stats = analyzer.compute_degree_stats(&degrees);
1085        
1086        assert_eq!(stats.min, 0);
1087        assert_eq!(stats.max, 8);
1088        assert!((stats.mean - 2.857).abs() < 0.01); // 20/7
1089        assert_eq!(stats.median, 2.0);
1090        assert!(stats.std_dev > 0.0);
1091    }
1092    
1093    #[test]
1094    fn test_percentile_calculation() {
1095        let analyzer = GraphStatisticsAnalyzer::new();
1096        let sorted_values = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
1097        
1098        assert_eq!(analyzer.percentile(&sorted_values, 0.0), 1.0);
1099        assert_eq!(analyzer.percentile(&sorted_values, 50.0), 5.5);
1100        assert_eq!(analyzer.percentile(&sorted_values, 100.0), 10.0);
1101        assert!((analyzer.percentile(&sorted_values, 25.0) - 3.25).abs() < 0.01);
1102    }
1103    
1104    #[test]
1105    fn test_histogram_computation() {
1106        let analyzer = GraphStatisticsAnalyzer::new();
1107        let degrees = vec![1, 1, 2, 2, 2, 3];
1108        
1109        let histogram = analyzer.compute_histogram(&degrees);
1110        
1111        assert_eq!(histogram[&1], 2);
1112        assert_eq!(histogram[&2], 3);
1113        assert_eq!(histogram[&3], 1);
1114    }
1115    
1116    #[test]
1117    fn test_power_law_estimation() {
1118        let analyzer = GraphStatisticsAnalyzer::new();
1119        
1120        // Power-law-like distribution
1121        let degrees = vec![1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20];
1122        let alpha = analyzer.estimate_power_law_alpha(&degrees);
1123        
1124        assert!(alpha.is_some());
1125        let alpha_val = alpha.unwrap();
1126        assert!(alpha_val > 0.0);
1127        assert!(alpha_val < 5.0); // Reasonable range
1128        
1129        println!("Estimated power-law alpha: {:.3}", alpha_val);
1130    }
1131    
1132    #[test]
1133    fn test_clustering_coefficient() {
1134        let mut graph = DependencyGraph::new();
1135        
1136        // Create a triangle: A -> B, B -> C, C -> A
1137        graph.add_edge("A".to_string(), "B".to_string()).unwrap();
1138        graph.add_edge("B".to_string(), "C".to_string()).unwrap();
1139        graph.add_edge("C".to_string(), "A".to_string()).unwrap();
1140        
1141        let analyzer = GraphStatisticsAnalyzer::new();
1142        let clustering = analyzer.local_clustering_coefficient(&graph, &"A".to_string());
1143        
1144        // Should be high since A's neighbors (B and C) are connected
1145        assert!(clustering >= 0.0);
1146        assert!(clustering <= 1.0);
1147        println!("Clustering coefficient for A: {:.3}", clustering);
1148    }
1149    
1150    #[test]
1151    fn test_bfs_distances() {
1152        let graph = create_test_graph();
1153        let analyzer = GraphStatisticsAnalyzer::new();
1154        
1155        let distances = analyzer.bfs_distances(&graph, &"main.py".to_string());
1156        
1157        assert_eq!(distances["main.py"], 0);
1158        assert!(distances.contains_key("utils.py"));
1159        assert!(distances.contains_key("config.py"));
1160        assert!(distances["utils.py"] <= distances["config.py"]);
1161    }
1162    
1163    #[test]
1164    fn test_structural_patterns() {
1165        let graph = create_test_graph();
1166        let analyzer = GraphStatisticsAnalyzer::new();
1167        
1168        let patterns = analyzer.identify_structural_patterns(&graph).unwrap();
1169        
1170        // main.py should be a hub (high out-degree)
1171        let main_py_hub = patterns.hubs.iter().find(|node| node.node_id == "main.py");
1172        assert!(main_py_hub.is_some());
1173        
1174        // config.py should be an authority (high in-degree)
1175        let config_py_auth = patterns.authorities.iter().find(|node| node.node_id == "config.py");
1176        assert!(config_py_auth.is_some());
1177        
1178        // isolated.py should be in isolated nodes
1179        assert!(patterns.isolated_nodes.contains(&"isolated.py".to_string()));
1180    }
1181    
1182    #[test]
1183    fn test_full_analysis() {
1184        let graph = create_test_graph();
1185        let analyzer = GraphStatisticsAnalyzer::new();
1186        
1187        let analysis = analyzer.analyze(&graph).unwrap();
1188        
1189        // Basic checks
1190        assert_eq!(analysis.basic_stats.total_nodes, 5); // Fixed: should be 5 nodes
1191        assert_eq!(analysis.basic_stats.total_edges, 5);
1192        
1193        // Degree distribution
1194        assert!(analysis.degree_distribution.in_degree.mean >= 0.0);
1195        assert!(analysis.degree_distribution.out_degree.mean >= 0.0);
1196        
1197        // Connectivity
1198        assert!(analysis.connectivity.graph_density >= 0.0);
1199        assert!(analysis.connectivity.graph_density <= 1.0);
1200        
1201        // Performance profile
1202        assert!(analysis.performance_profile.pagerank_memory_estimate_mb >= 0.0);
1203        assert!(analysis.performance_profile.pagerank_time_estimate_ms >= 0);
1204        
1205        // Analysis metadata (duration may be 0 on fast systems)
1206        assert!(analysis.analysis_metadata.analysis_duration_ms >= 0);
1207        assert_eq!(analysis.analysis_metadata.version, env!("CARGO_PKG_VERSION"));
1208    }
1209    
1210    #[test]
1211    fn test_summary_report() {
1212        let graph = create_test_graph();
1213        let analyzer = GraphStatisticsAnalyzer::new();
1214        let analysis = analyzer.analyze(&graph).unwrap();
1215        
1216        let summary = analysis.summary_report();
1217        
1218        assert!(summary.contains("Graph Analysis Summary Report"));
1219        assert!(summary.contains("Basic Statistics"));
1220        assert!(summary.contains("Degree Distribution"));
1221        assert!(summary.contains("Connectivity"));
1222        assert!(summary.contains("Structural Patterns"));
1223        assert!(summary.contains("Import Insights"));
1224        assert!(summary.contains("Performance Profile"));
1225        
1226        println!("Summary Report:\n{}", summary);
1227    }
1228    
1229    #[test]
1230    fn test_important_nodes_summary() {
1231        let graph = create_test_graph();
1232        let analyzer = GraphStatisticsAnalyzer::new();
1233        let analysis = analyzer.analyze(&graph).unwrap();
1234        
1235        let important_nodes = analysis.important_nodes_summary();
1236        
1237        assert!(!important_nodes.is_empty());
1238        assert_eq!(important_nodes.len(), 2); // Hubs and Authorities
1239        
1240        for (category, nodes) in &important_nodes {
1241            println!("{}: {:?}", category, nodes);
1242        }
1243    }
1244    
1245    #[test]
1246    fn test_empty_graph_analysis() {
1247        let graph = DependencyGraph::new();
1248        let analyzer = GraphStatisticsAnalyzer::new();
1249        
1250        let analysis = analyzer.analyze(&graph).unwrap();
1251        
1252        assert_eq!(analysis.basic_stats.total_nodes, 0);
1253        assert_eq!(analysis.basic_stats.total_edges, 0);
1254        assert_eq!(analysis.degree_distribution.in_degree.mean, 0.0);
1255        assert!(analysis.structural_patterns.hubs.is_empty());
1256        assert!(analysis.structural_patterns.authorities.is_empty());
1257    }
1258}