scribe_graph/
statistics.rs

1//! # Graph Statistics and Analysis for Dependency Graphs
2//!
3//! Comprehensive statistical analysis and metrics computation for code dependency graphs.
4//! This module provides detailed insights into graph structure, connectivity patterns,
5//! and characteristics that are essential for understanding codebase architecture.
6//!
7//! ## Key Features
8//! - **Degree Distribution Analysis**: In-degree, out-degree, and total degree statistics
9//! - **Connectivity Metrics**: Strongly connected components, graph density, clustering
10//! - **Structural Patterns**: Hub detection, authority identification, bottleneck analysis
11//! - **Import Relationship Insights**: Dependency depth, fan-in/fan-out analysis
12//! - **Performance Characteristics**: Memory usage, computation complexity estimates
13//! - **Comparative Analysis**: Before/after graph evolution tracking
14
15use scribe_core::Result;
16use serde::{Deserialize, Serialize};
17use std::collections::{HashMap, VecDeque};
18
19use crate::graph::{DependencyGraph, GraphStatistics, NodeId};
20
21/// Comprehensive graph analysis results
22#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
23pub struct GraphAnalysisResults {
24    /// Basic graph statistics
25    pub basic_stats: GraphStatistics,
26
27    /// Degree distribution analysis
28    pub degree_distribution: DegreeDistribution,
29
30    /// Connectivity analysis
31    pub connectivity: ConnectivityAnalysis,
32
33    /// Structural patterns
34    pub structural_patterns: StructuralPatterns,
35
36    /// Import relationship insights
37    pub import_insights: ImportInsights,
38
39    /// Performance characteristics
40    pub performance_profile: PerformanceProfile,
41
42    /// Analysis metadata
43    pub analysis_metadata: AnalysisMetadata,
44}
45
46/// Detailed degree distribution statistics
47#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
48pub struct DegreeDistribution {
49    /// In-degree statistics
50    pub in_degree: DegreeStats,
51
52    /// Out-degree statistics  
53    pub out_degree: DegreeStats,
54
55    /// Total degree statistics
56    pub total_degree: DegreeStats,
57
58    /// Degree distribution histogram (degree -> count)
59    pub in_degree_histogram: HashMap<usize, usize>,
60    pub out_degree_histogram: HashMap<usize, usize>,
61
62    /// Power-law fitting parameters (if applicable)
63    pub power_law_alpha: Option<f64>,
64    pub power_law_goodness_of_fit: Option<f64>,
65}
66
67/// Statistical measures for degree sequences
68#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
69pub struct DegreeStats {
70    pub min: usize,
71    pub max: usize,
72    pub mean: f64,
73    pub median: f64,
74    pub std_dev: f64,
75    pub percentile_25: f64,
76    pub percentile_75: f64,
77    pub percentile_90: f64,
78    pub percentile_95: f64,
79    pub percentile_99: f64,
80}
81
82/// Graph connectivity and component analysis
83#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
84pub struct ConnectivityAnalysis {
85    /// Number of weakly connected components
86    pub weakly_connected_components: usize,
87
88    /// Number of strongly connected components
89    pub strongly_connected_components: usize,
90
91    /// Size of largest strongly connected component
92    pub largest_scc_size: usize,
93
94    /// Graph density (actual edges / possible edges)
95    pub graph_density: f64,
96
97    /// Average clustering coefficient
98    pub average_clustering: f64,
99
100    /// Global clustering coefficient (transitivity)
101    pub global_clustering: f64,
102
103    /// Average path length (in largest component)
104    pub average_path_length: Option<f64>,
105
106    /// Graph diameter (longest shortest path)
107    pub diameter: Option<usize>,
108
109    /// Is the graph acyclic (DAG)
110    pub is_acyclic: bool,
111}
112
113/// Structural patterns and notable nodes
114#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
115pub struct StructuralPatterns {
116    /// Hub nodes (high out-degree)
117    pub hubs: Vec<NodeInfo>,
118
119    /// Authority nodes (high in-degree)
120    pub authorities: Vec<NodeInfo>,
121
122    /// Bottleneck nodes (high betweenness centrality estimate)
123    pub bottlenecks: Vec<NodeInfo>,
124
125    /// Bridge nodes (critical for connectivity)
126    pub bridges: Vec<NodeInfo>,
127
128    /// Isolated nodes (no connections)
129    pub isolated_nodes: Vec<NodeId>,
130
131    /// Dangling nodes (no outgoing edges)
132    pub dangling_nodes: Vec<NodeId>,
133
134    /// Leaf nodes (no incoming edges, but have outgoing)
135    pub leaf_nodes: Vec<NodeId>,
136}
137
138/// Information about important nodes
139#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
140pub struct NodeInfo {
141    pub node_id: NodeId,
142    pub score: f64,
143    pub in_degree: usize,
144    pub out_degree: usize,
145    pub metadata: Option<String>, // Additional context
146}
147
148/// Import relationship specific insights
149#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
150pub struct ImportInsights {
151    /// Average import depth (distance from root nodes)
152    pub average_import_depth: f64,
153
154    /// Maximum import depth
155    pub max_import_depth: usize,
156
157    /// Import fan-out distribution (how many files each file imports)
158    pub fan_out_distribution: DegreeStats,
159
160    /// Import fan-in distribution (how many files import each file)
161    pub fan_in_distribution: DegreeStats,
162
163    /// Circular dependency detection
164    pub circular_dependencies: Vec<CircularDependency>,
165
166    /// Dependency layers (topological levels)
167    pub dependency_layers: Vec<Vec<NodeId>>,
168
169    /// Critical import paths (most important dependency chains)
170    pub critical_paths: Vec<DependencyPath>,
171}
172
173/// Circular dependency information
174#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
175pub struct CircularDependency {
176    pub nodes: Vec<NodeId>,
177    pub cycle_length: usize,
178    pub strength: f64, // How many edges form the cycle
179}
180
181/// Important dependency path
182#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
183pub struct DependencyPath {
184    pub path: Vec<NodeId>,
185    pub length: usize,
186    pub importance_score: f64,
187}
188
189/// Performance characteristics of the graph
190#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
191pub struct PerformanceProfile {
192    /// Estimated memory usage for PageRank computation
193    pub pagerank_memory_estimate_mb: f64,
194
195    /// Estimated PageRank computation time
196    pub pagerank_time_estimate_ms: u64,
197
198    /// Graph traversal complexity
199    pub traversal_complexity: TraversalComplexity,
200
201    /// Storage efficiency metrics
202    pub storage_efficiency: StorageEfficiency,
203}
204
205/// Complexity analysis for graph algorithms
206#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
207pub struct TraversalComplexity {
208    /// BFS/DFS time complexity class
209    pub time_complexity_class: String,
210
211    /// Space complexity class
212    pub space_complexity_class: String,
213
214    /// Expected iterations for convergence algorithms
215    pub expected_iterations: usize,
216}
217
218/// Storage and memory efficiency metrics
219#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
220pub struct StorageEfficiency {
221    /// Adjacency list memory usage (bytes)
222    pub adjacency_list_size_bytes: usize,
223
224    /// Average edges per node
225    pub edges_per_node: f64,
226
227    /// Memory overhead ratio
228    pub memory_overhead_ratio: f64,
229
230    /// Sparsity coefficient
231    pub sparsity: f64,
232}
233
234/// Analysis execution metadata
235#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
236pub struct AnalysisMetadata {
237    /// When the analysis was performed
238    pub timestamp: chrono::DateTime<chrono::Utc>,
239
240    /// Analysis duration in milliseconds
241    pub analysis_duration_ms: u64,
242
243    /// Whether parallel computation was used
244    pub used_parallel: bool,
245
246    /// Analysis configuration used
247    pub config: StatisticsConfig,
248
249    /// Version of the statistics module
250    pub version: String,
251}
252
253impl Default for AnalysisMetadata {
254    fn default() -> Self {
255        Self {
256            timestamp: chrono::Utc::now(),
257            analysis_duration_ms: 0,
258            used_parallel: false,
259            config: StatisticsConfig::default(),
260            version: "1.0.0".to_string(),
261        }
262    }
263}
264
265/// Configuration for statistics computation
266#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
267pub struct StatisticsConfig {
268    /// Whether to compute expensive metrics (clustering, path length)
269    pub compute_expensive_metrics: bool,
270
271    /// Whether to use parallel computation where possible
272    pub use_parallel: bool,
273
274    /// Maximum number of nodes to analyze for expensive operations
275    pub max_nodes_for_expensive_ops: usize,
276
277    /// Number of top nodes to include in structural patterns
278    pub top_nodes_count: usize,
279
280    /// Minimum score threshold for pattern detection
281    pub pattern_threshold: f64,
282}
283
284impl Default for StatisticsConfig {
285    fn default() -> Self {
286        Self {
287            compute_expensive_metrics: true,
288            use_parallel: true,
289            max_nodes_for_expensive_ops: 10000,
290            top_nodes_count: 10,
291            pattern_threshold: 0.1,
292        }
293    }
294}
295
296/// Main graph statistics analyzer
297#[derive(Debug)]
298pub struct GraphStatisticsAnalyzer {
299    config: StatisticsConfig,
300}
301
302impl GraphStatisticsAnalyzer {
303    /// Create a new analyzer with default configuration
304    pub fn new() -> Self {
305        Self {
306            config: StatisticsConfig::default(),
307        }
308    }
309
310    /// Create with custom configuration
311    pub fn with_config(config: StatisticsConfig) -> Self {
312        Self { config }
313    }
314
315    /// Create analyzer optimized for large graphs
316    pub fn for_large_graphs() -> Self {
317        Self {
318            config: StatisticsConfig {
319                compute_expensive_metrics: false,
320                max_nodes_for_expensive_ops: 5000,
321                top_nodes_count: 5,
322                ..StatisticsConfig::default()
323            },
324        }
325    }
326
327    /// Perform comprehensive analysis of the dependency graph
328    pub fn analyze(&self, graph: &DependencyGraph) -> Result<GraphAnalysisResults> {
329        let start_time = std::time::Instant::now();
330
331        // Basic statistics
332        let basic_stats = self.compute_basic_statistics(graph);
333
334        // Degree distribution analysis
335        let degree_distribution = self.analyze_degree_distribution(graph)?;
336
337        // Connectivity analysis
338        let connectivity = self.analyze_connectivity(graph)?;
339
340        // Structural patterns
341        let structural_patterns = self.identify_structural_patterns(graph)?;
342
343        // Import-specific insights
344        let import_insights = self.analyze_import_patterns(graph)?;
345
346        // Performance characteristics
347        let performance_profile = self.estimate_performance_characteristics(graph);
348
349        // Analysis metadata
350        let analysis_metadata = AnalysisMetadata {
351            timestamp: chrono::Utc::now(),
352            analysis_duration_ms: start_time.elapsed().as_millis() as u64,
353            used_parallel: self.config.use_parallel,
354            config: self.config.clone(),
355            version: env!("CARGO_PKG_VERSION").to_string(),
356        };
357
358        Ok(GraphAnalysisResults {
359            basic_stats,
360            degree_distribution,
361            connectivity,
362            structural_patterns,
363            import_insights,
364            performance_profile,
365            analysis_metadata,
366        })
367    }
368
369    /// Compute basic graph statistics
370    fn compute_basic_statistics(&self, graph: &DependencyGraph) -> GraphStatistics {
371        let total_nodes = graph.node_count();
372        let total_edges = graph.edge_count();
373
374        if total_nodes == 0 {
375            return GraphStatistics::empty();
376        }
377
378        // Compute degree statistics
379        let degrees: Vec<_> = graph
380            .nodes()
381            .map(|node| (graph.in_degree(node), graph.out_degree(node)))
382            .collect();
383
384        let in_degrees: Vec<_> = degrees.iter().map(|(in_deg, _)| *in_deg).collect();
385        let out_degrees: Vec<_> = degrees.iter().map(|(_, out_deg)| *out_deg).collect();
386
387        let in_degree_avg = in_degrees.iter().sum::<usize>() as f64 / total_nodes as f64;
388        let in_degree_max = *in_degrees.iter().max().unwrap_or(&0);
389        let out_degree_avg = out_degrees.iter().sum::<usize>() as f64 / total_nodes as f64;
390        let out_degree_max = *out_degrees.iter().max().unwrap_or(&0);
391
392        // Count special node types
393        let isolated_nodes = degrees
394            .iter()
395            .filter(|(in_deg, out_deg)| *in_deg == 0 && *out_deg == 0)
396            .count();
397        let dangling_nodes = degrees.iter().filter(|(_, out_deg)| *out_deg == 0).count();
398
399        // Graph density
400        let max_possible_edges = total_nodes * (total_nodes - 1);
401        let graph_density = if max_possible_edges > 0 {
402            total_edges as f64 / max_possible_edges as f64
403        } else {
404            0.0
405        };
406
407        GraphStatistics {
408            total_nodes,
409            total_edges,
410            in_degree_avg,
411            in_degree_max,
412            out_degree_avg,
413            out_degree_max,
414            strongly_connected_components: graph.estimate_scc_count(),
415            graph_density,
416            isolated_nodes,
417            dangling_nodes,
418        }
419    }
420
421    /// Analyze degree distributions in detail
422    fn analyze_degree_distribution(&self, graph: &DependencyGraph) -> Result<DegreeDistribution> {
423        let degrees: Vec<_> = graph
424            .nodes()
425            .map(|node| (graph.in_degree(node), graph.out_degree(node)))
426            .collect();
427
428        let in_degrees: Vec<_> = degrees.iter().map(|(in_deg, _)| *in_deg).collect();
429        let out_degrees: Vec<_> = degrees.iter().map(|(_, out_deg)| *out_deg).collect();
430        let total_degrees: Vec<_> = degrees
431            .iter()
432            .map(|(in_deg, out_deg)| in_deg + out_deg)
433            .collect();
434
435        Ok(DegreeDistribution {
436            in_degree: self.compute_degree_stats(&in_degrees),
437            out_degree: self.compute_degree_stats(&out_degrees),
438            total_degree: self.compute_degree_stats(&total_degrees),
439            in_degree_histogram: self.compute_histogram(&in_degrees),
440            out_degree_histogram: self.compute_histogram(&out_degrees),
441            power_law_alpha: self.estimate_power_law_alpha(&in_degrees),
442            power_law_goodness_of_fit: None, // TODO: Implement KS test
443        })
444    }
445
446    /// Compute detailed statistics for a degree sequence
447    fn compute_degree_stats(&self, degrees: &[usize]) -> DegreeStats {
448        if degrees.is_empty() {
449            return DegreeStats {
450                min: 0,
451                max: 0,
452                mean: 0.0,
453                median: 0.0,
454                std_dev: 0.0,
455                percentile_25: 0.0,
456                percentile_75: 0.0,
457                percentile_90: 0.0,
458                percentile_95: 0.0,
459                percentile_99: 0.0,
460            };
461        }
462
463        let mut sorted_degrees = degrees.to_vec();
464        sorted_degrees.sort();
465
466        let min = sorted_degrees[0];
467        let max = sorted_degrees[sorted_degrees.len() - 1];
468        let mean = degrees.iter().sum::<usize>() as f64 / degrees.len() as f64;
469
470        // Percentiles
471        let median = self.percentile(&sorted_degrees, 50.0);
472        let percentile_25 = self.percentile(&sorted_degrees, 25.0);
473        let percentile_75 = self.percentile(&sorted_degrees, 75.0);
474        let percentile_90 = self.percentile(&sorted_degrees, 90.0);
475        let percentile_95 = self.percentile(&sorted_degrees, 95.0);
476        let percentile_99 = self.percentile(&sorted_degrees, 99.0);
477
478        // Standard deviation
479        let variance = degrees
480            .iter()
481            .map(|&deg| (deg as f64 - mean).powi(2))
482            .sum::<f64>()
483            / degrees.len() as f64;
484        let std_dev = variance.sqrt();
485
486        DegreeStats {
487            min,
488            max,
489            mean,
490            median,
491            std_dev,
492            percentile_25,
493            percentile_75,
494            percentile_90,
495            percentile_95,
496            percentile_99,
497        }
498    }
499
500    /// Calculate percentile from sorted array
501    fn percentile(&self, sorted_values: &[usize], percentile: f64) -> f64 {
502        if sorted_values.is_empty() {
503            return 0.0;
504        }
505
506        let index = (percentile / 100.0) * (sorted_values.len() - 1) as f64;
507        let lower = index.floor() as usize;
508        let upper = index.ceil() as usize;
509
510        if lower == upper {
511            sorted_values[lower] as f64
512        } else {
513            let weight = index - lower as f64;
514            (1.0 - weight) * sorted_values[lower] as f64 + weight * sorted_values[upper] as f64
515        }
516    }
517
518    /// Compute degree histogram
519    fn compute_histogram(&self, degrees: &[usize]) -> HashMap<usize, usize> {
520        let mut histogram = HashMap::new();
521        for &degree in degrees {
522            *histogram.entry(degree).or_insert(0) += 1;
523        }
524        histogram
525    }
526
527    /// Estimate power-law exponent alpha using linear regression on log-log plot
528    fn estimate_power_law_alpha(&self, degrees: &[usize]) -> Option<f64> {
529        // Filter out zeros (can't take log)
530        let non_zero_degrees: Vec<_> = degrees.iter().filter(|&&d| d > 0).collect();
531
532        if non_zero_degrees.len() < 10 {
533            return None; // Not enough data for reliable estimation
534        }
535
536        // Create log-log data points
537        let mut log_points = Vec::new();
538        let histogram =
539            self.compute_histogram(&non_zero_degrees.into_iter().copied().collect::<Vec<_>>());
540
541        for (&degree, &count) in &histogram {
542            if count > 0 {
543                log_points.push((degree as f64, count as f64));
544            }
545        }
546
547        if log_points.len() < 5 {
548            return None;
549        }
550
551        // Linear regression: log(count) = -alpha * log(degree) + C
552        let n = log_points.len() as f64;
553        let sum_log_x: f64 = log_points.iter().map(|(x, _)| x.ln()).sum();
554        let sum_log_y: f64 = log_points.iter().map(|(_, y)| y.ln()).sum();
555        let sum_log_x_log_y: f64 = log_points.iter().map(|(x, y)| x.ln() * y.ln()).sum();
556        let sum_log_x_squared: f64 = log_points.iter().map(|(x, _)| x.ln().powi(2)).sum();
557
558        let denominator = n * sum_log_x_squared - sum_log_x.powi(2);
559        if denominator.abs() < 1e-10 {
560            return None;
561        }
562
563        let alpha = -(n * sum_log_x_log_y - sum_log_x * sum_log_y) / denominator;
564
565        // Only return positive alpha (valid for power-law)
566        if alpha > 0.0 {
567            Some(alpha)
568        } else {
569            None
570        }
571    }
572
573    /// Analyze graph connectivity properties
574    fn analyze_connectivity(&self, graph: &DependencyGraph) -> Result<ConnectivityAnalysis> {
575        let strongly_connected_components = graph.estimate_scc_count();
576        let largest_scc_size = self.estimate_largest_scc_size(graph);
577
578        // For now, assume weakly connected components = strongly connected components
579        // A more sophisticated implementation would use Union-Find
580        let weakly_connected_components = strongly_connected_components;
581
582        let graph_density = if graph.node_count() > 1 {
583            let max_edges = graph.node_count() * (graph.node_count() - 1);
584            graph.edge_count() as f64 / max_edges as f64
585        } else {
586            0.0
587        };
588
589        // Expensive computations only for smaller graphs
590        let (average_clustering, global_clustering, average_path_length, diameter) =
591            if self.config.compute_expensive_metrics
592                && graph.node_count() <= self.config.max_nodes_for_expensive_ops
593            {
594                (
595                    Some(self.estimate_average_clustering(graph)),
596                    Some(self.estimate_global_clustering(graph)),
597                    self.estimate_average_path_length(graph),
598                    self.estimate_diameter(graph),
599                )
600            } else {
601                (None, None, None, None)
602            };
603
604        let is_acyclic = self.estimate_is_acyclic(graph);
605
606        Ok(ConnectivityAnalysis {
607            weakly_connected_components,
608            strongly_connected_components,
609            largest_scc_size,
610            graph_density,
611            average_clustering: average_clustering.unwrap_or(0.0),
612            global_clustering: global_clustering.unwrap_or(0.0),
613            average_path_length,
614            diameter,
615            is_acyclic,
616        })
617    }
618
619    /// Estimate size of largest strongly connected component
620    fn estimate_largest_scc_size(&self, graph: &DependencyGraph) -> usize {
621        // Simplified estimation: find node with highest product of in/out degrees
622        graph
623            .nodes()
624            .map(|node| std::cmp::min(graph.in_degree(node) + 1, graph.out_degree(node) + 1))
625            .max()
626            .unwrap_or(0)
627    }
628
629    /// Estimate average clustering coefficient
630    fn estimate_average_clustering(&self, graph: &DependencyGraph) -> f64 {
631        let clustering_coefficients: Vec<f64> = graph
632            .nodes()
633            .map(|node| self.local_clustering_coefficient(graph, node))
634            .collect();
635
636        if clustering_coefficients.is_empty() {
637            0.0
638        } else {
639            clustering_coefficients.iter().sum::<f64>() / clustering_coefficients.len() as f64
640        }
641    }
642
643    /// Compute local clustering coefficient for a node
644    fn local_clustering_coefficient(&self, graph: &DependencyGraph, node: &NodeId) -> f64 {
645        let neighbors = graph.all_neighbors(node);
646        let k = neighbors.len();
647
648        if k < 2 {
649            return 0.0;
650        }
651
652        // Count edges between neighbors
653        let mut edges_between_neighbors = 0;
654        let neighbor_vec: Vec<_> = neighbors.iter().collect();
655
656        for i in 0..neighbor_vec.len() {
657            for j in (i + 1)..neighbor_vec.len() {
658                if graph.contains_edge(neighbor_vec[i], neighbor_vec[j])
659                    || graph.contains_edge(neighbor_vec[j], neighbor_vec[i])
660                {
661                    edges_between_neighbors += 1;
662                }
663            }
664        }
665
666        let max_possible_edges = k * (k - 1) / 2;
667        edges_between_neighbors as f64 / max_possible_edges as f64
668    }
669
670    /// Estimate global clustering coefficient (transitivity)
671    fn estimate_global_clustering(&self, graph: &DependencyGraph) -> f64 {
672        let mut triangles = 0;
673        let mut triplets = 0;
674
675        for node in graph.nodes() {
676            let neighbors = graph.all_neighbors(node);
677            if neighbors.len() < 2 {
678                continue;
679            }
680
681            let neighbor_vec: Vec<_> = neighbors.iter().collect();
682
683            for i in 0..neighbor_vec.len() {
684                for j in (i + 1)..neighbor_vec.len() {
685                    triplets += 1;
686                    if graph.contains_edge(neighbor_vec[i], neighbor_vec[j])
687                        || graph.contains_edge(neighbor_vec[j], neighbor_vec[i])
688                    {
689                        triangles += 1;
690                    }
691                }
692            }
693        }
694
695        if triplets > 0 {
696            3.0 * triangles as f64 / triplets as f64
697        } else {
698            0.0
699        }
700    }
701
702    /// Estimate average path length using sampling
703    fn estimate_average_path_length(&self, graph: &DependencyGraph) -> Option<f64> {
704        let nodes: Vec<_> = graph.nodes().collect();
705        if nodes.len() < 2 {
706            return None;
707        }
708
709        let sample_size = std::cmp::min(100, nodes.len());
710        let mut total_path_length = 0.0;
711        let mut valid_paths = 0;
712
713        for i in 0..sample_size {
714            let start_node = &nodes[i % nodes.len()];
715            let distances = self.bfs_distances(graph, start_node);
716
717            for distance in distances.values() {
718                if *distance > 0 && *distance < usize::MAX {
719                    total_path_length += *distance as f64;
720                    valid_paths += 1;
721                }
722            }
723        }
724
725        if valid_paths > 0 {
726            Some(total_path_length / valid_paths as f64)
727        } else {
728            None
729        }
730    }
731
732    /// Estimate graph diameter
733    fn estimate_diameter(&self, graph: &DependencyGraph) -> Option<usize> {
734        let nodes: Vec<_> = graph.nodes().collect();
735        if nodes.is_empty() {
736            return None;
737        }
738
739        let mut max_distance = 0;
740        let sample_size = std::cmp::min(20, nodes.len());
741
742        for i in 0..sample_size {
743            let start_node = &nodes[i % nodes.len()];
744            let distances = self.bfs_distances(graph, start_node);
745
746            for &distance in distances.values() {
747                if distance != usize::MAX && distance > max_distance {
748                    max_distance = distance;
749                }
750            }
751        }
752
753        if max_distance > 0 {
754            Some(max_distance)
755        } else {
756            None
757        }
758    }
759
760    /// BFS to compute distances from a source node
761    fn bfs_distances(&self, graph: &DependencyGraph, source: &NodeId) -> HashMap<NodeId, usize> {
762        let mut distances = HashMap::new();
763        let mut queue = VecDeque::new();
764
765        distances.insert(source.clone(), 0);
766        queue.push_back(source.clone());
767
768        while let Some(current) = queue.pop_front() {
769            let current_distance = distances[&current];
770
771            // Visit both outgoing and incoming neighbors (undirected traversal)
772            if let Some(outgoing) = graph.outgoing_neighbors(&current) {
773                for neighbor in outgoing {
774                    if !distances.contains_key(neighbor) {
775                        distances.insert(neighbor.clone(), current_distance + 1);
776                        queue.push_back(neighbor.clone());
777                    }
778                }
779            }
780
781            if let Some(incoming) = graph.incoming_neighbors(&current) {
782                for neighbor in incoming {
783                    if !distances.contains_key(neighbor) {
784                        distances.insert(neighbor.clone(), current_distance + 1);
785                        queue.push_back(neighbor.clone());
786                    }
787                }
788            }
789        }
790
791        distances
792    }
793
794    /// Check if graph is likely acyclic (DAG)
795    fn estimate_is_acyclic(&self, graph: &DependencyGraph) -> bool {
796        // Simple heuristic: if there are very few nodes with both in and out edges, likely acyclic
797        let nodes_with_both = graph
798            .nodes()
799            .filter(|&node| graph.in_degree(node) > 0 && graph.out_degree(node) > 0)
800            .count();
801
802        let total_nodes = graph.node_count();
803        if total_nodes == 0 {
804            return true;
805        }
806
807        let bidirectional_ratio = nodes_with_both as f64 / total_nodes as f64;
808        bidirectional_ratio < 0.3 // Heuristic threshold
809    }
810
811    /// Identify structural patterns in the graph
812    fn identify_structural_patterns(&self, graph: &DependencyGraph) -> Result<StructuralPatterns> {
813        // Find hubs (high out-degree)
814        let mut hub_candidates: Vec<_> = graph
815            .nodes()
816            .map(|node| NodeInfo {
817                node_id: node.clone(),
818                score: graph.out_degree(node) as f64,
819                in_degree: graph.in_degree(node),
820                out_degree: graph.out_degree(node),
821                metadata: None,
822            })
823            .collect();
824        hub_candidates.sort_by(|a, b| {
825            b.score
826                .partial_cmp(&a.score)
827                .unwrap_or(std::cmp::Ordering::Equal)
828        });
829        let hubs = hub_candidates
830            .into_iter()
831            .take(self.config.top_nodes_count)
832            .collect();
833
834        // Find authorities (high in-degree)
835        let mut authority_candidates: Vec<_> = graph
836            .nodes()
837            .map(|node| NodeInfo {
838                node_id: node.clone(),
839                score: graph.in_degree(node) as f64,
840                in_degree: graph.in_degree(node),
841                out_degree: graph.out_degree(node),
842                metadata: None,
843            })
844            .collect();
845        authority_candidates.sort_by(|a, b| {
846            b.score
847                .partial_cmp(&a.score)
848                .unwrap_or(std::cmp::Ordering::Equal)
849        });
850        let authorities = authority_candidates
851            .into_iter()
852            .take(self.config.top_nodes_count)
853            .collect();
854
855        // Identify special node types
856        let isolated_nodes: Vec<_> = graph
857            .nodes()
858            .filter(|&node| graph.degree(node) == 0)
859            .cloned()
860            .collect();
861
862        let dangling_nodes = graph.dangling_nodes().into_iter().cloned().collect();
863
864        let leaf_nodes: Vec<_> = graph
865            .nodes()
866            .filter(|&node| graph.in_degree(node) == 0 && graph.out_degree(node) > 0)
867            .cloned()
868            .collect();
869
870        // TODO: Implement betweenness centrality for bottlenecks and bridges
871        let bottlenecks = Vec::new();
872        let bridges = Vec::new();
873
874        Ok(StructuralPatterns {
875            hubs,
876            authorities,
877            bottlenecks,
878            bridges,
879            isolated_nodes,
880            dangling_nodes,
881            leaf_nodes,
882        })
883    }
884
885    /// Analyze import-specific patterns
886    fn analyze_import_patterns(&self, graph: &DependencyGraph) -> Result<ImportInsights> {
887        let fan_out_stats = {
888            let out_degrees: Vec<_> = graph.nodes().map(|node| graph.out_degree(node)).collect();
889            self.compute_degree_stats(&out_degrees)
890        };
891
892        let fan_in_stats = {
893            let in_degrees: Vec<_> = graph.nodes().map(|node| graph.in_degree(node)).collect();
894            self.compute_degree_stats(&in_degrees)
895        };
896
897        // TODO: Implement sophisticated cycle detection and dependency layering
898        let circular_dependencies = Vec::new();
899        let dependency_layers = Vec::new();
900        let critical_paths = Vec::new();
901
902        // Simple depth estimation
903        let max_import_depth = if let Some(diameter) = self.estimate_diameter(graph) {
904            diameter
905        } else {
906            0
907        };
908
909        let average_import_depth = if let Some(avg_path) = self.estimate_average_path_length(graph)
910        {
911            avg_path
912        } else {
913            0.0
914        };
915
916        Ok(ImportInsights {
917            average_import_depth,
918            max_import_depth,
919            fan_out_distribution: fan_out_stats,
920            fan_in_distribution: fan_in_stats,
921            circular_dependencies,
922            dependency_layers,
923            critical_paths,
924        })
925    }
926
927    /// Estimate performance characteristics for various graph algorithms
928    fn estimate_performance_characteristics(&self, graph: &DependencyGraph) -> PerformanceProfile {
929        let n = graph.node_count();
930        let m = graph.edge_count();
931
932        // PageRank memory estimate (2 score vectors + graph overhead)
933        let pagerank_memory_mb = if n > 0 {
934            let score_vector_size = n * std::mem::size_of::<f64>();
935            let graph_overhead = m * (std::mem::size_of::<String>() + std::mem::size_of::<usize>());
936            ((score_vector_size * 2 + graph_overhead) as f64) / (1024.0 * 1024.0)
937        } else {
938            0.0
939        };
940
941        // Rough PageRank time estimate (based on empirical observations)
942        let pagerank_time_estimate_ms = if n > 0 {
943            // Base time + iterations * (nodes + edges)
944            let base_time = 1;
945            let per_iteration_time = (n + m) / 10000; // Rough estimate
946            let estimated_iterations = if n < 1000 { 10 } else { 20 };
947            base_time + estimated_iterations * per_iteration_time.max(1)
948        } else {
949            0
950        } as u64;
951
952        let traversal_complexity = TraversalComplexity {
953            time_complexity_class: format!("O(V + E) = O({} + {})", n, m),
954            space_complexity_class: format!("O(V) = O({})", n),
955            expected_iterations: if n < 1000 { 10 } else { 20 },
956        };
957
958        let edges_per_node = if n > 0 { m as f64 / n as f64 } else { 0.0 };
959        let max_possible_edges = if n > 1 { n * (n - 1) } else { 1 };
960        let sparsity = 1.0 - (m as f64 / max_possible_edges as f64);
961
962        let storage_efficiency = StorageEfficiency {
963            adjacency_list_size_bytes: m
964                * (std::mem::size_of::<String>() + std::mem::size_of::<usize>()),
965            edges_per_node,
966            memory_overhead_ratio: 1.5, // Rough estimate for hash map overhead
967            sparsity,
968        };
969
970        PerformanceProfile {
971            pagerank_memory_estimate_mb: pagerank_memory_mb,
972            pagerank_time_estimate_ms,
973            traversal_complexity,
974            storage_efficiency,
975        }
976    }
977}
978
979impl Default for GraphStatisticsAnalyzer {
980    fn default() -> Self {
981        Self::new()
982    }
983}
984
985/// Utility functions for graph analysis results
986impl GraphAnalysisResults {
987    /// Generate a comprehensive summary report
988    pub fn summary_report(&self) -> String {
989        format!(
990            "Graph Analysis Summary Report\n\
991             ============================\n\
992             \n\
993             Basic Statistics:\n\
994             - Nodes: {} | Edges: {} | Density: {:.4}\n\
995             - Average in-degree: {:.2} | Average out-degree: {:.2}\n\
996             - Strongly connected components: {}\n\
997             \n\
998             Degree Distribution:\n\
999             - In-degree range: [{}, {}] (mean: {:.2}, std: {:.2})\n\
1000             - Out-degree range: [{}, {}] (mean: {:.2}, std: {:.2})\n\
1001             - Power-law alpha: {}\n\
1002             \n\
1003             Connectivity:\n\
1004             - Average clustering: {:.4}\n\
1005             - Average path length: {}\n\
1006             - Diameter: {} | Is acyclic: {}\n\
1007             \n\
1008             Structural Patterns:\n\
1009             - Top hubs: {} | Top authorities: {}\n\
1010             - Isolated nodes: {} | Dangling nodes: {} | Leaf nodes: {}\n\
1011             \n\
1012             Import Insights:\n\
1013             - Average import depth: {:.2} | Max depth: {}\n\
1014             - Circular dependencies: {}\n\
1015             \n\
1016             Performance Profile:\n\
1017             - PageRank memory estimate: {:.1} MB\n\
1018             - PageRank time estimate: {} ms\n\
1019             - Graph sparsity: {:.4}\n\
1020             \n\
1021             Analysis completed in {} ms (parallel: {})",
1022            self.basic_stats.total_nodes,
1023            self.basic_stats.total_edges,
1024            self.basic_stats.graph_density,
1025            self.basic_stats.in_degree_avg,
1026            self.basic_stats.out_degree_avg,
1027            self.basic_stats.strongly_connected_components,
1028            self.degree_distribution.in_degree.min,
1029            self.degree_distribution.in_degree.max,
1030            self.degree_distribution.in_degree.mean,
1031            self.degree_distribution.in_degree.std_dev,
1032            self.degree_distribution.out_degree.min,
1033            self.degree_distribution.out_degree.max,
1034            self.degree_distribution.out_degree.mean,
1035            self.degree_distribution.out_degree.std_dev,
1036            self.degree_distribution
1037                .power_law_alpha
1038                .map_or("N/A".to_string(), |a| format!("{:.3}", a)),
1039            self.connectivity.average_clustering,
1040            self.connectivity
1041                .average_path_length
1042                .map_or("N/A".to_string(), |l| format!("{:.2}", l)),
1043            self.connectivity
1044                .diameter
1045                .map_or("N/A".to_string(), |d| d.to_string()),
1046            self.connectivity.is_acyclic,
1047            self.structural_patterns.hubs.len(),
1048            self.structural_patterns.authorities.len(),
1049            self.structural_patterns.isolated_nodes.len(),
1050            self.structural_patterns.dangling_nodes.len(),
1051            self.structural_patterns.leaf_nodes.len(),
1052            self.import_insights.average_import_depth,
1053            self.import_insights.max_import_depth,
1054            self.import_insights.circular_dependencies.len(),
1055            self.performance_profile.pagerank_memory_estimate_mb,
1056            self.performance_profile.pagerank_time_estimate_ms,
1057            self.performance_profile.storage_efficiency.sparsity,
1058            self.analysis_metadata.analysis_duration_ms,
1059            self.analysis_metadata.used_parallel,
1060        )
1061    }
1062
1063    /// Get a list of the most important nodes by various metrics
1064    pub fn important_nodes_summary(&self) -> Vec<(String, Vec<String>)> {
1065        vec![
1066            (
1067                "Top Hubs (High Out-degree)".to_string(),
1068                self.structural_patterns
1069                    .hubs
1070                    .iter()
1071                    .map(|node| format!("{} (out: {})", node.node_id, node.out_degree))
1072                    .collect(),
1073            ),
1074            (
1075                "Top Authorities (High In-degree)".to_string(),
1076                self.structural_patterns
1077                    .authorities
1078                    .iter()
1079                    .map(|node| format!("{} (in: {})", node.node_id, node.in_degree))
1080                    .collect(),
1081            ),
1082        ]
1083    }
1084}
1085
1086#[cfg(test)]
1087mod tests {
1088    use super::*;
1089    use crate::graph::DependencyGraph;
1090
1091    fn create_test_graph() -> DependencyGraph {
1092        let mut graph = DependencyGraph::new();
1093
1094        // Create a more complex graph for testing
1095        graph
1096            .add_edge("main.py".to_string(), "utils.py".to_string())
1097            .unwrap();
1098        graph
1099            .add_edge("main.py".to_string(), "config.py".to_string())
1100            .unwrap();
1101        graph
1102            .add_edge("utils.py".to_string(), "config.py".to_string())
1103            .unwrap();
1104        graph
1105            .add_edge("test.py".to_string(), "main.py".to_string())
1106            .unwrap();
1107        graph
1108            .add_edge("test.py".to_string(), "utils.py".to_string())
1109            .unwrap();
1110        graph.add_node("isolated.py".to_string()).unwrap();
1111
1112        graph
1113    }
1114
1115    #[test]
1116    fn test_statistics_analyzer_creation() {
1117        let analyzer = GraphStatisticsAnalyzer::new();
1118        assert!(analyzer.config.compute_expensive_metrics);
1119
1120        let large_graph_analyzer = GraphStatisticsAnalyzer::for_large_graphs();
1121        assert!(!large_graph_analyzer.config.compute_expensive_metrics);
1122    }
1123
1124    #[test]
1125    fn test_basic_statistics() {
1126        let graph = create_test_graph();
1127        let analyzer = GraphStatisticsAnalyzer::new();
1128
1129        let stats = analyzer.compute_basic_statistics(&graph);
1130
1131        println!("Actual node count: {}, Expected: 6", stats.total_nodes);
1132        assert_eq!(stats.total_nodes, 5); // Fixed: should be 5 nodes (main.py, utils.py, config.py, test.py, isolated.py)
1133        assert_eq!(stats.total_edges, 5);
1134        assert!(stats.graph_density > 0.0);
1135        assert!(stats.graph_density < 1.0);
1136        assert_eq!(stats.isolated_nodes, 1); // isolated.py
1137    }
1138
1139    #[test]
1140    fn test_degree_statistics() {
1141        let analyzer = GraphStatisticsAnalyzer::new();
1142        let degrees = vec![0, 1, 1, 2, 3, 5, 8];
1143
1144        let stats = analyzer.compute_degree_stats(&degrees);
1145
1146        assert_eq!(stats.min, 0);
1147        assert_eq!(stats.max, 8);
1148        assert!((stats.mean - 2.857).abs() < 0.01); // 20/7
1149        assert_eq!(stats.median, 2.0);
1150        assert!(stats.std_dev > 0.0);
1151    }
1152
1153    #[test]
1154    fn test_percentile_calculation() {
1155        let analyzer = GraphStatisticsAnalyzer::new();
1156        let sorted_values = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
1157
1158        assert_eq!(analyzer.percentile(&sorted_values, 0.0), 1.0);
1159        assert_eq!(analyzer.percentile(&sorted_values, 50.0), 5.5);
1160        assert_eq!(analyzer.percentile(&sorted_values, 100.0), 10.0);
1161        assert!((analyzer.percentile(&sorted_values, 25.0) - 3.25).abs() < 0.01);
1162    }
1163
1164    #[test]
1165    fn test_histogram_computation() {
1166        let analyzer = GraphStatisticsAnalyzer::new();
1167        let degrees = vec![1, 1, 2, 2, 2, 3];
1168
1169        let histogram = analyzer.compute_histogram(&degrees);
1170
1171        assert_eq!(histogram[&1], 2);
1172        assert_eq!(histogram[&2], 3);
1173        assert_eq!(histogram[&3], 1);
1174    }
1175
1176    #[test]
1177    fn test_power_law_estimation() {
1178        let analyzer = GraphStatisticsAnalyzer::new();
1179
1180        // Power-law-like distribution
1181        let degrees = vec![1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20];
1182        let alpha = analyzer.estimate_power_law_alpha(&degrees);
1183
1184        assert!(alpha.is_some());
1185        let alpha_val = alpha.unwrap();
1186        assert!(alpha_val > 0.0);
1187        assert!(alpha_val < 5.0); // Reasonable range
1188
1189        println!("Estimated power-law alpha: {:.3}", alpha_val);
1190    }
1191
1192    #[test]
1193    fn test_clustering_coefficient() {
1194        let mut graph = DependencyGraph::new();
1195
1196        // Create a triangle: A -> B, B -> C, C -> A
1197        graph.add_edge("A".to_string(), "B".to_string()).unwrap();
1198        graph.add_edge("B".to_string(), "C".to_string()).unwrap();
1199        graph.add_edge("C".to_string(), "A".to_string()).unwrap();
1200
1201        let analyzer = GraphStatisticsAnalyzer::new();
1202        let clustering = analyzer.local_clustering_coefficient(&graph, &"A".to_string());
1203
1204        // Should be high since A's neighbors (B and C) are connected
1205        assert!(clustering >= 0.0);
1206        assert!(clustering <= 1.0);
1207        println!("Clustering coefficient for A: {:.3}", clustering);
1208    }
1209
1210    #[test]
1211    fn test_bfs_distances() {
1212        let graph = create_test_graph();
1213        let analyzer = GraphStatisticsAnalyzer::new();
1214
1215        let distances = analyzer.bfs_distances(&graph, &"main.py".to_string());
1216
1217        assert_eq!(distances["main.py"], 0);
1218        assert!(distances.contains_key("utils.py"));
1219        assert!(distances.contains_key("config.py"));
1220        assert!(distances["utils.py"] <= distances["config.py"]);
1221    }
1222
1223    #[test]
1224    fn test_structural_patterns() {
1225        let graph = create_test_graph();
1226        let analyzer = GraphStatisticsAnalyzer::new();
1227
1228        let patterns = analyzer.identify_structural_patterns(&graph).unwrap();
1229
1230        // main.py should be a hub (high out-degree)
1231        let main_py_hub = patterns.hubs.iter().find(|node| node.node_id == "main.py");
1232        assert!(main_py_hub.is_some());
1233
1234        // config.py should be an authority (high in-degree)
1235        let config_py_auth = patterns
1236            .authorities
1237            .iter()
1238            .find(|node| node.node_id == "config.py");
1239        assert!(config_py_auth.is_some());
1240
1241        // isolated.py should be in isolated nodes
1242        assert!(patterns.isolated_nodes.contains(&"isolated.py".to_string()));
1243    }
1244
1245    #[test]
1246    fn test_full_analysis() {
1247        let graph = create_test_graph();
1248        let analyzer = GraphStatisticsAnalyzer::new();
1249
1250        let analysis = analyzer.analyze(&graph).unwrap();
1251
1252        // Basic checks
1253        assert_eq!(analysis.basic_stats.total_nodes, 5); // Fixed: should be 5 nodes
1254        assert_eq!(analysis.basic_stats.total_edges, 5);
1255
1256        // Degree distribution
1257        assert!(analysis.degree_distribution.in_degree.mean >= 0.0);
1258        assert!(analysis.degree_distribution.out_degree.mean >= 0.0);
1259
1260        // Connectivity
1261        assert!(analysis.connectivity.graph_density >= 0.0);
1262        assert!(analysis.connectivity.graph_density <= 1.0);
1263
1264        // Performance profile
1265        assert!(analysis.performance_profile.pagerank_memory_estimate_mb >= 0.0);
1266        assert!(analysis.performance_profile.pagerank_time_estimate_ms >= 0);
1267
1268        // Analysis metadata (duration may be 0 on fast systems)
1269        assert!(analysis.analysis_metadata.analysis_duration_ms >= 0);
1270        assert_eq!(
1271            analysis.analysis_metadata.version,
1272            env!("CARGO_PKG_VERSION")
1273        );
1274    }
1275
1276    #[test]
1277    fn test_summary_report() {
1278        let graph = create_test_graph();
1279        let analyzer = GraphStatisticsAnalyzer::new();
1280        let analysis = analyzer.analyze(&graph).unwrap();
1281
1282        let summary = analysis.summary_report();
1283
1284        assert!(summary.contains("Graph Analysis Summary Report"));
1285        assert!(summary.contains("Basic Statistics"));
1286        assert!(summary.contains("Degree Distribution"));
1287        assert!(summary.contains("Connectivity"));
1288        assert!(summary.contains("Structural Patterns"));
1289        assert!(summary.contains("Import Insights"));
1290        assert!(summary.contains("Performance Profile"));
1291
1292        println!("Summary Report:\n{}", summary);
1293    }
1294
1295    #[test]
1296    fn test_important_nodes_summary() {
1297        let graph = create_test_graph();
1298        let analyzer = GraphStatisticsAnalyzer::new();
1299        let analysis = analyzer.analyze(&graph).unwrap();
1300
1301        let important_nodes = analysis.important_nodes_summary();
1302
1303        assert!(!important_nodes.is_empty());
1304        assert_eq!(important_nodes.len(), 2); // Hubs and Authorities
1305
1306        for (category, nodes) in &important_nodes {
1307            println!("{}: {:?}", category, nodes);
1308        }
1309    }
1310
1311    #[test]
1312    fn test_empty_graph_analysis() {
1313        let graph = DependencyGraph::new();
1314        let analyzer = GraphStatisticsAnalyzer::new();
1315
1316        let analysis = analyzer.analyze(&graph).unwrap();
1317
1318        assert_eq!(analysis.basic_stats.total_nodes, 0);
1319        assert_eq!(analysis.basic_stats.total_edges, 0);
1320        assert_eq!(analysis.degree_distribution.in_degree.mean, 0.0);
1321        assert!(analysis.structural_patterns.hubs.is_empty());
1322        assert!(analysis.structural_patterns.authorities.is_empty());
1323    }
1324}