scribe_graph/
graph.rs

1//! # Graph Data Structures for PageRank Centrality
2//!
3//! Efficient graph representation optimized for dependency analysis and PageRank computation.
4//! This module implements the core graph structures used in the PageRank centrality algorithm
5//! with emphasis on reverse edges for code dependency analysis.
6//!
7//! ## Design Philosophy
8//! - **Forward edges**: A imports B (A -> B)  
9//! - **Reverse edges**: B is imported by A (B <- A)
10//! - **PageRank flows along reverse edges** (importance flows to imported files)
11//! - **Memory-efficient adjacency list representation** for large graphs (10k+ nodes)
12//! - **Fast degree queries** and statistics calculation
13
14use dashmap::DashMap;
15use parking_lot::RwLock;
16use scribe_core::{error::ScribeError, file, Language, Result};
17use serde::{Deserialize, Serialize};
18use std::collections::{HashMap, HashSet};
19
20/// Internal node identifier type for efficient graph operations (usize for array indexing)
21pub type InternalNodeId = usize;
22
23/// External node identifier type for the dependency graph (file paths)
24pub type NodeId = String;
25
26/// Edge weight type (unused in unweighted PageRank, but reserved for extensions)
27pub type EdgeWeight = f64;
28
29/// Direction for graph traversal
30#[derive(Debug, Clone, Copy, PartialEq, Eq)]
31pub enum TraversalDirection {
32    /// Traverse along outgoing edges (dependencies)
33    Dependencies,
34    /// Traverse along incoming edges (dependents)
35    Dependents,
36    /// Traverse in both directions
37    Both,
38}
39
40/// Efficient dependency graph representation optimized for PageRank computation
41/// Uses integer-based internal representation for massive performance improvements
42#[derive(Debug, Clone)]
43pub struct DependencyGraph {
44    /// Forward adjacency list: internal_id -> set of internal_ids it imports
45    forward_edges: Vec<HashSet<InternalNodeId>>,
46
47    /// Reverse adjacency list: internal_id -> set of internal_ids that import it (for PageRank)
48    reverse_edges: Vec<HashSet<InternalNodeId>>,
49
50    /// Mapping from file path to internal node ID
51    path_to_id: HashMap<NodeId, InternalNodeId>,
52
53    /// Mapping from internal node ID to file path
54    id_to_path: Vec<NodeId>,
55
56    /// Node metadata cache (indexed by internal ID)
57    node_metadata: Vec<Option<NodeMetadata>>,
58
59    /// Graph statistics cache (invalidated on mutations)
60    stats_cache: Option<GraphStatistics>,
61
62    /// Next available internal node ID
63    next_id: InternalNodeId,
64}
65
66/// Metadata associated with each node in the graph
67#[derive(Debug, Clone, PartialEq)]
68pub struct NodeMetadata {
69    /// File path of the node
70    pub file_path: String,
71    /// Programming language detected
72    pub language: Option<String>,
73    /// Whether this is an entrypoint file
74    pub is_entrypoint: bool,
75    /// Whether this is a test file
76    pub is_test: bool,
77    /// File size in bytes (for statistics)
78    pub size_bytes: u64,
79}
80
81impl NodeMetadata {
82    /// Create new node metadata
83    pub fn new(file_path: String) -> Self {
84        let path = std::path::Path::new(&file_path);
85        let language_enum = file::detect_language_from_path(path);
86        let language = if matches!(language_enum, Language::Unknown) {
87            None
88        } else {
89            Some(file::language_display_name(&language_enum).to_lowercase())
90        };
91        let is_entrypoint = file::is_entrypoint_path(path, &language_enum);
92        let is_test = file::is_test_path(path);
93
94        Self {
95            file_path,
96            language,
97            is_entrypoint,
98            is_test,
99            size_bytes: 0,
100        }
101    }
102
103    /// Create with size information
104    pub fn with_size(mut self, size_bytes: u64) -> Self {
105        self.size_bytes = size_bytes;
106        self
107    }
108}
109
110/// Graph construction and manipulation operations
111impl DependencyGraph {
112    /// Create a new empty dependency graph
113    pub fn new() -> Self {
114        Self {
115            forward_edges: Vec::new(),
116            reverse_edges: Vec::new(),
117            path_to_id: HashMap::new(),
118            id_to_path: Vec::new(),
119            node_metadata: Vec::new(),
120            stats_cache: None,
121            next_id: 0,
122        }
123    }
124
125    /// Create with initial capacity hint for performance optimization
126    pub fn with_capacity(capacity: usize) -> Self {
127        Self {
128            forward_edges: Vec::with_capacity(capacity),
129            reverse_edges: Vec::with_capacity(capacity),
130            path_to_id: HashMap::with_capacity(capacity),
131            id_to_path: Vec::with_capacity(capacity),
132            node_metadata: Vec::with_capacity(capacity),
133            stats_cache: None,
134            next_id: 0,
135        }
136    }
137
138    /// Add a node to the graph (can exist without edges)
139    pub fn add_node(&mut self, node_id: NodeId) -> Result<InternalNodeId> {
140        // Check if node already exists
141        if let Some(&existing_id) = self.path_to_id.get(&node_id) {
142            return Ok(existing_id);
143        }
144
145        let internal_id = self.next_id;
146        self.next_id += 1;
147
148        // Add to mappings
149        self.path_to_id.insert(node_id.clone(), internal_id);
150        self.id_to_path.push(node_id.clone());
151
152        // Initialize empty adjacency lists
153        self.forward_edges.push(HashSet::new());
154        self.reverse_edges.push(HashSet::new());
155
156        // Add default metadata
157        self.node_metadata.push(Some(NodeMetadata::new(node_id)));
158
159        // Invalidate cache
160        self.stats_cache = None;
161
162        Ok(internal_id)
163    }
164
165    /// Add a node with metadata
166    pub fn add_node_with_metadata(
167        &mut self,
168        node_id: NodeId,
169        metadata: NodeMetadata,
170    ) -> Result<InternalNodeId> {
171        let internal_id = self.add_node(node_id)?;
172        self.node_metadata[internal_id] = Some(metadata);
173        Ok(internal_id)
174    }
175
176    /// Add an import edge: from_file imports to_file
177    pub fn add_edge(&mut self, from_node: NodeId, to_node: NodeId) -> Result<()> {
178        // Ensure both nodes exist and get their internal IDs
179        let from_id = self.add_node(from_node)?;
180        let to_id = self.add_node(to_node)?;
181
182        // Add forward edge: from_id -> to_id
183        self.forward_edges[from_id].insert(to_id);
184
185        // Add reverse edge: to_id <- from_id
186        self.reverse_edges[to_id].insert(from_id);
187
188        // Invalidate cache
189        self.stats_cache = None;
190
191        Ok(())
192    }
193
194    /// Add multiple edges efficiently (batch operation)
195    pub fn add_edges(&mut self, edges: &[(NodeId, NodeId)]) -> Result<()> {
196        for (from_node, to_node) in edges {
197            self.add_edge(from_node.clone(), to_node.clone())?;
198        }
199        Ok(())
200    }
201
202    /// Remove a node and all its edges
203    pub fn remove_node(&mut self, node_id: &NodeId) -> Result<bool> {
204        let internal_id = match self.path_to_id.get(node_id) {
205            Some(&id) => id,
206            None => return Ok(false),
207        };
208
209        // Get outgoing edges to clean up reverse references
210        let outgoing = self.forward_edges[internal_id].clone();
211        for target_id in &outgoing {
212            self.reverse_edges[*target_id].remove(&internal_id);
213        }
214
215        // Get incoming edges to clean up forward references
216        let incoming = self.reverse_edges[internal_id].clone();
217        for source_id in &incoming {
218            self.forward_edges[*source_id].remove(&internal_id);
219        }
220
221        // Clear the adjacency lists for this node
222        self.forward_edges[internal_id].clear();
223        self.reverse_edges[internal_id].clear();
224
225        // Remove metadata
226        self.node_metadata[internal_id] = None;
227
228        // Remove from path mapping (but keep internal_id for consistency)
229        self.path_to_id.remove(node_id);
230
231        // Note: We don't remove from id_to_path to maintain index consistency
232        // Instead, we'll need to handle None cases when iterating
233
234        // Invalidate cache
235        self.stats_cache = None;
236
237        Ok(true)
238    }
239
240    /// Remove an edge between two nodes
241    pub fn remove_edge(&mut self, from_node: &NodeId, to_node: &NodeId) -> Result<bool> {
242        let from_id = match self.path_to_id.get(from_node) {
243            Some(&id) => id,
244            None => return Ok(false),
245        };
246
247        let to_id = match self.path_to_id.get(to_node) {
248            Some(&id) => id,
249            None => return Ok(false),
250        };
251
252        let forward_removed = self.forward_edges[from_id].remove(&to_id);
253        let reverse_removed = self.reverse_edges[to_id].remove(&from_id);
254
255        if forward_removed || reverse_removed {
256            self.stats_cache = None;
257        }
258
259        Ok(forward_removed || reverse_removed)
260    }
261
262    /// Check if a node exists in the graph
263    pub fn contains_node(&self, node_id: &NodeId) -> bool {
264        self.path_to_id.contains_key(node_id)
265    }
266
267    /// Check if an edge exists between two nodes
268    pub fn contains_edge(&self, from_node: &NodeId, to_node: &NodeId) -> bool {
269        match (self.path_to_id.get(from_node), self.path_to_id.get(to_node)) {
270            (Some(&from_id), Some(&to_id)) => self.forward_edges[from_id].contains(&to_id),
271            _ => false,
272        }
273    }
274
275    /// Get the number of nodes in the graph
276    pub fn node_count(&self) -> usize {
277        self.path_to_id.len()
278    }
279
280    /// Get the total number of edges in the graph
281    pub fn edge_count(&self) -> usize {
282        self.forward_edges.iter().map(|edges| edges.len()).sum()
283    }
284
285    /// Get all nodes in the graph
286    pub fn nodes(&self) -> impl Iterator<Item = &NodeId> {
287        self.path_to_id.keys()
288    }
289
290    /// Get all edges in the graph as (from, to) pairs
291    pub fn edges(&self) -> impl Iterator<Item = (String, String)> + '_ {
292        self.forward_edges
293            .iter()
294            .enumerate()
295            .flat_map(move |(from_id, targets)| {
296                let from_path = self.id_to_path[from_id].clone();
297                targets.iter().map(move |&to_id| {
298                    let to_path = self.id_to_path[to_id].clone();
299                    (from_path.clone(), to_path)
300                })
301            })
302    }
303}
304
305/// Degree and neighbor query operations
306impl DependencyGraph {
307    /// Get in-degree of a node (number of files that import this node)
308    pub fn in_degree(&self, node_id: &NodeId) -> usize {
309        match self.path_to_id.get(node_id) {
310            Some(&internal_id) => self.reverse_edges[internal_id].len(),
311            None => 0,
312        }
313    }
314
315    /// Get out-degree of a node (number of files this node imports)
316    pub fn out_degree(&self, node_id: &NodeId) -> usize {
317        match self.path_to_id.get(node_id) {
318            Some(&internal_id) => self.forward_edges[internal_id].len(),
319            None => 0,
320        }
321    }
322
323    /// Get total degree of a node (in + out)
324    pub fn degree(&self, node_id: &NodeId) -> usize {
325        self.in_degree(node_id) + self.out_degree(node_id)
326    }
327
328    /// Get nodes that this node imports (outgoing edges)
329    pub fn outgoing_neighbors(&self, node_id: &NodeId) -> Option<Vec<&NodeId>> {
330        match self.path_to_id.get(node_id) {
331            Some(&internal_id) => {
332                let neighbors: Vec<&NodeId> = self.forward_edges[internal_id]
333                    .iter()
334                    .map(|&target_id| &self.id_to_path[target_id])
335                    .collect();
336                Some(neighbors)
337            }
338            None => None,
339        }
340    }
341
342    /// Get nodes that import this node (incoming edges) - important for PageRank
343    pub fn incoming_neighbors(&self, node_id: &NodeId) -> Option<Vec<&NodeId>> {
344        match self.path_to_id.get(node_id) {
345            Some(&internal_id) => {
346                let neighbors: Vec<&NodeId> = self.reverse_edges[internal_id]
347                    .iter()
348                    .map(|&source_id| &self.id_to_path[source_id])
349                    .collect();
350                Some(neighbors)
351            }
352            None => None,
353        }
354    }
355
356    /// Get both incoming and outgoing neighbors
357    pub fn all_neighbors(&self, node_id: &NodeId) -> HashSet<&NodeId> {
358        let mut neighbors = HashSet::new();
359
360        if let Some(&internal_id) = self.path_to_id.get(node_id) {
361            // Add outgoing neighbors
362            for &target_id in &self.forward_edges[internal_id] {
363                neighbors.insert(&self.id_to_path[target_id]);
364            }
365
366            // Add incoming neighbors
367            for &source_id in &self.reverse_edges[internal_id] {
368                neighbors.insert(&self.id_to_path[source_id]);
369            }
370        }
371
372        neighbors
373    }
374
375    /// Get all transitive dependencies of a node (files it depends on, transitively)
376    ///
377    /// Performs BFS along outgoing edges to find all files this node imports,
378    /// directly or indirectly, up to max_depth levels.
379    ///
380    /// # Arguments
381    /// * `node_id` - The starting node
382    /// * `max_depth` - Maximum depth to traverse (None for unlimited)
383    ///
384    /// # Returns
385    /// Set of all transitively reachable nodes via outgoing edges (dependencies)
386    pub fn transitive_dependencies(&self, node_id: &NodeId, max_depth: Option<usize>) -> HashSet<NodeId> {
387        use std::collections::VecDeque;
388
389        let mut result = HashSet::new();
390        let mut visited = HashSet::new();
391        let mut queue = VecDeque::new();
392
393        // Start from the given node
394        if !self.contains_node(node_id) {
395            return result;
396        }
397
398        queue.push_back((node_id.clone(), 0));
399        visited.insert(node_id.clone());
400
401        while let Some((current, depth)) = queue.pop_front() {
402            // Check depth limit
403            if let Some(max_d) = max_depth {
404                if depth >= max_d {
405                    continue;
406                }
407            }
408
409            // Get outgoing neighbors (dependencies)
410            if let Some(neighbors) = self.outgoing_neighbors(&current) {
411                for neighbor in neighbors {
412                    if !visited.contains(neighbor) {
413                        visited.insert(neighbor.clone());
414                        result.insert(neighbor.clone());
415                        queue.push_back((neighbor.clone(), depth + 1));
416                    }
417                }
418            }
419        }
420
421        result
422    }
423
424    /// Get all transitive dependents of a node (files that depend on it, transitively)
425    ///
426    /// Performs BFS along incoming edges to find all files that import this node,
427    /// directly or indirectly, up to max_depth levels.
428    ///
429    /// # Arguments
430    /// * `node_id` - The starting node
431    /// * `max_depth` - Maximum depth to traverse (None for unlimited)
432    ///
433    /// # Returns
434    /// Set of all transitively reachable nodes via incoming edges (dependents)
435    pub fn transitive_dependents(&self, node_id: &NodeId, max_depth: Option<usize>) -> HashSet<NodeId> {
436        use std::collections::VecDeque;
437
438        let mut result = HashSet::new();
439        let mut visited = HashSet::new();
440        let mut queue = VecDeque::new();
441
442        // Start from the given node
443        if !self.contains_node(node_id) {
444            return result;
445        }
446
447        queue.push_back((node_id.clone(), 0));
448        visited.insert(node_id.clone());
449
450        while let Some((current, depth)) = queue.pop_front() {
451            // Check depth limit
452            if let Some(max_d) = max_depth {
453                if depth >= max_d {
454                    continue;
455                }
456            }
457
458            // Get incoming neighbors (dependents)
459            if let Some(neighbors) = self.incoming_neighbors(&current) {
460                for neighbor in neighbors {
461                    if !visited.contains(neighbor) {
462                        visited.insert(neighbor.clone());
463                        result.insert(neighbor.clone());
464                        queue.push_back((neighbor.clone(), depth + 1));
465                    }
466                }
467            }
468        }
469
470        result
471    }
472
473    /// Compute the closure of a set of seed nodes
474    ///
475    /// # Arguments
476    /// * `seeds` - Starting set of nodes
477    /// * `direction` - Which direction to traverse
478    /// * `max_depth` - Maximum traversal depth (None for unlimited)
479    ///
480    /// # Returns
481    /// Set containing all seeds plus all reachable nodes in the specified direction
482    pub fn compute_closure(
483        &self,
484        seeds: &[NodeId],
485        direction: TraversalDirection,
486        max_depth: Option<usize>,
487    ) -> HashSet<NodeId> {
488        let mut result: HashSet<NodeId> = seeds.iter().cloned().collect();
489
490        for seed in seeds {
491            let reachable = match direction {
492                TraversalDirection::Dependencies => self.transitive_dependencies(seed, max_depth),
493                TraversalDirection::Dependents => self.transitive_dependents(seed, max_depth),
494                TraversalDirection::Both => {
495                    let mut combined = self.transitive_dependencies(seed, max_depth);
496                    combined.extend(self.transitive_dependents(seed, max_depth));
497                    combined
498                }
499            };
500            result.extend(reachable);
501        }
502
503        result
504    }
505
506    /// Get degree information for a node
507    pub fn get_degree_info(&self, node_id: &NodeId) -> Option<DegreeInfo> {
508        if !self.contains_node(node_id) {
509            return None;
510        }
511
512        Some(DegreeInfo {
513            node_id: node_id.clone(),
514            in_degree: self.in_degree(node_id),
515            out_degree: self.out_degree(node_id),
516            total_degree: self.degree(node_id),
517        })
518    }
519
520    /// Internal API: Get internal node ID for path (for PageRank optimization)
521    pub(crate) fn get_internal_id(&self, node_id: &NodeId) -> Option<InternalNodeId> {
522        self.path_to_id.get(node_id).copied()
523    }
524
525    /// Internal API: Get path for internal ID
526    pub(crate) fn get_path(&self, internal_id: InternalNodeId) -> Option<&NodeId> {
527        self.id_to_path.get(internal_id)
528    }
529
530    /// Internal API: Get incoming neighbors by internal ID (for PageRank)
531    pub(crate) fn incoming_neighbors_by_id(
532        &self,
533        internal_id: InternalNodeId,
534    ) -> Option<&HashSet<InternalNodeId>> {
535        self.reverse_edges.get(internal_id)
536    }
537
538    /// Internal API: Get out-degree by internal ID (for PageRank)
539    pub(crate) fn out_degree_by_id(&self, internal_id: InternalNodeId) -> usize {
540        self.forward_edges
541            .get(internal_id)
542            .map_or(0, |edges| edges.len())
543    }
544
545    /// Internal API: Get total number of active nodes (for PageRank)
546    pub(crate) fn internal_node_count(&self) -> usize {
547        self.path_to_id.len()
548    }
549
550    /// Internal API: Iterator over all internal node IDs with their paths
551    pub(crate) fn internal_nodes(&self) -> impl Iterator<Item = (InternalNodeId, &NodeId)> {
552        self.path_to_id.iter().map(|(path, &id)| (id, path))
553    }
554}
555
556/// Node metadata and information queries
557impl DependencyGraph {
558    /// Get metadata for a node
559    pub fn node_metadata(&self, node_id: &NodeId) -> Option<&NodeMetadata> {
560        match self.path_to_id.get(node_id) {
561            Some(&internal_id) => self.node_metadata[internal_id].as_ref(),
562            None => None,
563        }
564    }
565
566    /// Set metadata for a node
567    pub fn set_node_metadata(&mut self, node_id: NodeId, metadata: NodeMetadata) -> Result<()> {
568        match self.path_to_id.get(&node_id) {
569            Some(&internal_id) => {
570                self.node_metadata[internal_id] = Some(metadata);
571                Ok(())
572            }
573            None => Err(ScribeError::invalid_operation(
574                format!("Node {} does not exist in graph", node_id),
575                "set_node_metadata".to_string(),
576            )),
577        }
578    }
579
580    /// Get all entrypoint nodes
581    pub fn entrypoint_nodes(&self) -> Vec<&NodeId> {
582        self.node_metadata
583            .iter()
584            .enumerate()
585            .filter_map(|(internal_id, meta_opt)| {
586                if let Some(meta) = meta_opt {
587                    if meta.is_entrypoint {
588                        return Some(&self.id_to_path[internal_id]);
589                    }
590                }
591                None
592            })
593            .collect()
594    }
595
596    /// Get all test nodes
597    pub fn test_nodes(&self) -> Vec<&NodeId> {
598        self.node_metadata
599            .iter()
600            .enumerate()
601            .filter_map(|(internal_id, meta_opt)| {
602                if let Some(meta) = meta_opt {
603                    if meta.is_test {
604                        return Some(&self.id_to_path[internal_id]);
605                    }
606                }
607                None
608            })
609            .collect()
610    }
611
612    /// Get nodes by language
613    pub fn nodes_by_language(&self, language: &str) -> Vec<&NodeId> {
614        self.node_metadata
615            .iter()
616            .enumerate()
617            .filter_map(|(internal_id, meta_opt)| {
618                if let Some(meta) = meta_opt {
619                    if meta.language.as_deref() == Some(language) {
620                        return Some(&self.id_to_path[internal_id]);
621                    }
622                }
623                None
624            })
625            .collect()
626    }
627}
628
629/// Specialized operations for PageRank computation
630impl DependencyGraph {
631    /// Get all nodes with their reverse edge neighbors (for PageRank iteration)
632    pub fn pagerank_iterator(&self) -> impl Iterator<Item = (&NodeId, Option<Vec<&NodeId>>)> + '_ {
633        self.path_to_id.iter().map(|(node_path, &internal_id)| {
634            let incoming: Option<Vec<&NodeId>> = if !self.reverse_edges[internal_id].is_empty() {
635                Some(
636                    self.reverse_edges[internal_id]
637                        .iter()
638                        .map(|&source_id| &self.id_to_path[source_id])
639                        .collect(),
640                )
641            } else {
642                Some(Vec::new())
643            };
644            (node_path, incoming)
645        })
646    }
647
648    /// Get dangling nodes (nodes with no outgoing edges)
649    pub fn dangling_nodes(&self) -> Vec<&NodeId> {
650        self.path_to_id
651            .iter()
652            .filter(|(_, &internal_id)| self.forward_edges[internal_id].is_empty())
653            .map(|(node_path, _)| node_path)
654            .collect()
655    }
656
657    /// Get strongly connected components (simplified estimation for statistics)
658    pub fn estimate_scc_count(&self) -> usize {
659        if self.path_to_id.is_empty() {
660            return 0;
661        }
662
663        // Count nodes with both in and out edges (likely in cycles)
664        let potential_scc_nodes = self
665            .path_to_id
666            .iter()
667            .filter(|(_, &internal_id)| {
668                !self.reverse_edges[internal_id].is_empty()
669                    && !self.forward_edges[internal_id].is_empty()
670            })
671            .count();
672
673        // Rough estimate: most SCCs are small, assume average size of 3
674        let estimated_scc = if potential_scc_nodes > 0 {
675            std::cmp::max(1, potential_scc_nodes / 3)
676        } else {
677            0
678        };
679
680        // Add isolated nodes and simple chains
681        let isolated_nodes = self.path_to_id.len() - potential_scc_nodes;
682        estimated_scc + isolated_nodes
683    }
684
685    /// Check if the graph is strongly connected (simplified check)
686    pub fn is_strongly_connected(&self) -> bool {
687        if self.path_to_id.is_empty() {
688            return true;
689        }
690
691        // Simplified check: all nodes have both in and out edges
692        self.path_to_id.iter().all(|(_, &internal_id)| {
693            !self.reverse_edges[internal_id].is_empty()
694                && !self.forward_edges[internal_id].is_empty()
695        })
696    }
697}
698
699/// Concurrent graph operations for performance
700impl DependencyGraph {
701    /// Create a thread-safe concurrent graph for parallel operations
702    pub fn into_concurrent(self) -> ConcurrentDependencyGraph {
703        // Convert Vec<HashSet> to DashMap representation for concurrency
704        let forward_edges = DashMap::new();
705        let reverse_edges = DashMap::new();
706
707        for (internal_id, edge_set) in self.forward_edges.into_iter().enumerate() {
708            forward_edges.insert(internal_id, edge_set);
709        }
710
711        for (internal_id, edge_set) in self.reverse_edges.into_iter().enumerate() {
712            reverse_edges.insert(internal_id, edge_set);
713        }
714
715        ConcurrentDependencyGraph {
716            forward_edges,
717            reverse_edges,
718            path_to_id: DashMap::from_iter(self.path_to_id),
719            id_to_path: RwLock::new(self.id_to_path),
720            node_metadata: RwLock::new(self.node_metadata),
721            stats_cache: RwLock::new(self.stats_cache),
722            next_id: RwLock::new(self.next_id),
723        }
724    }
725}
726
727/// Thread-safe concurrent version of DependencyGraph
728#[derive(Debug)]
729pub struct ConcurrentDependencyGraph {
730    forward_edges: DashMap<InternalNodeId, HashSet<InternalNodeId>>,
731    reverse_edges: DashMap<InternalNodeId, HashSet<InternalNodeId>>,
732    path_to_id: DashMap<NodeId, InternalNodeId>,
733    id_to_path: RwLock<Vec<NodeId>>,
734    node_metadata: RwLock<Vec<Option<NodeMetadata>>>,
735    stats_cache: RwLock<Option<GraphStatistics>>,
736    next_id: RwLock<InternalNodeId>,
737}
738
739impl ConcurrentDependencyGraph {
740    /// Add a node concurrently
741    pub fn add_node(&self, node_id: NodeId) -> Result<InternalNodeId> {
742        // Check if node already exists
743        if let Some(existing_id) = self.path_to_id.get(&node_id) {
744            return Ok(*existing_id);
745        }
746
747        let internal_id = {
748            let mut next_id = self.next_id.write();
749            let id = *next_id;
750            *next_id += 1;
751            id
752        };
753
754        // Add to mappings
755        self.path_to_id.insert(node_id.clone(), internal_id);
756        {
757            let mut id_to_path = self.id_to_path.write();
758            id_to_path.push(node_id.clone());
759        }
760
761        // Initialize empty adjacency lists
762        self.forward_edges.insert(internal_id, HashSet::new());
763        self.reverse_edges.insert(internal_id, HashSet::new());
764
765        // Add default metadata
766        {
767            let mut metadata = self.node_metadata.write();
768            metadata.push(Some(NodeMetadata::new(node_id)));
769        }
770
771        // Invalidate stats cache
772        *self.stats_cache.write() = None;
773
774        Ok(internal_id)
775    }
776
777    /// Get in-degree concurrently
778    pub fn in_degree(&self, node_id: &NodeId) -> usize {
779        match self.path_to_id.get(node_id) {
780            Some(internal_id) => self
781                .reverse_edges
782                .get(&internal_id)
783                .map_or(0, |entry| entry.len()),
784            None => 0,
785        }
786    }
787
788    /// Get out-degree concurrently  
789    pub fn out_degree(&self, node_id: &NodeId) -> usize {
790        match self.path_to_id.get(node_id) {
791            Some(internal_id) => self
792                .forward_edges
793                .get(&internal_id)
794                .map_or(0, |entry| entry.len()),
795            None => 0,
796        }
797    }
798
799    /// Convert back to single-threaded graph
800    pub fn into_sequential(self) -> DependencyGraph {
801        let id_to_path = self.id_to_path.into_inner();
802        let node_metadata = self.node_metadata.into_inner();
803        let stats_cache = self.stats_cache.into_inner();
804        let next_id = self.next_id.into_inner();
805
806        // Convert DashMap back to Vec
807        let mut forward_edges = vec![HashSet::new(); next_id];
808        let mut reverse_edges = vec![HashSet::new(); next_id];
809
810        for (internal_id, edge_set) in self.forward_edges.into_iter() {
811            if internal_id < forward_edges.len() {
812                forward_edges[internal_id] = edge_set;
813            }
814        }
815
816        for (internal_id, edge_set) in self.reverse_edges.into_iter() {
817            if internal_id < reverse_edges.len() {
818                reverse_edges[internal_id] = edge_set;
819            }
820        }
821
822        DependencyGraph {
823            forward_edges,
824            reverse_edges,
825            path_to_id: self.path_to_id.into_iter().collect(),
826            id_to_path,
827            node_metadata,
828            stats_cache,
829            next_id,
830        }
831    }
832}
833
834/// Degree information for a node
835#[derive(Debug, Clone, PartialEq)]
836pub struct DegreeInfo {
837    pub node_id: NodeId,
838    pub in_degree: usize,
839    pub out_degree: usize,
840    pub total_degree: usize,
841}
842
843/// Graph statistics computed lazily and cached
844#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
845pub struct GraphStatistics {
846    /// Total number of nodes
847    pub total_nodes: usize,
848    /// Total number of edges
849    pub total_edges: usize,
850    /// Average in-degree
851    pub in_degree_avg: f64,
852    /// Maximum in-degree
853    pub in_degree_max: usize,
854    /// Average out-degree
855    pub out_degree_avg: f64,
856    /// Maximum out-degree
857    pub out_degree_max: usize,
858    /// Estimated number of strongly connected components
859    pub strongly_connected_components: usize,
860    /// Graph density (actual_edges / possible_edges)
861    pub graph_density: f64,
862    /// Number of isolated nodes (no edges)
863    pub isolated_nodes: usize,
864    /// Number of dangling nodes (no outgoing edges)
865    pub dangling_nodes: usize,
866}
867
868impl GraphStatistics {
869    /// Create empty statistics
870    pub fn empty() -> Self {
871        Self {
872            total_nodes: 0,
873            total_edges: 0,
874            in_degree_avg: 0.0,
875            in_degree_max: 0,
876            out_degree_avg: 0.0,
877            out_degree_max: 0,
878            strongly_connected_components: 0,
879            graph_density: 0.0,
880            isolated_nodes: 0,
881            dangling_nodes: 0,
882        }
883    }
884}
885
886impl Default for DependencyGraph {
887    fn default() -> Self {
888        Self::new()
889    }
890}
891
892#[cfg(test)]
893mod tests {
894    use super::*;
895
896    #[test]
897    fn test_graph_creation() {
898        let graph = DependencyGraph::new();
899        assert_eq!(graph.node_count(), 0);
900        assert_eq!(graph.edge_count(), 0);
901    }
902
903    #[test]
904    fn test_node_operations() {
905        let mut graph = DependencyGraph::new();
906
907        // Add nodes
908        graph.add_node("main.py".to_string()).unwrap();
909        graph.add_node("utils.py".to_string()).unwrap();
910
911        assert_eq!(graph.node_count(), 2);
912        assert!(graph.contains_node(&"main.py".to_string()));
913        assert!(graph.contains_node(&"utils.py".to_string()));
914
915        // Remove node
916        let removed = graph.remove_node(&"utils.py".to_string()).unwrap();
917        assert!(removed);
918        assert_eq!(graph.node_count(), 1);
919        assert!(!graph.contains_node(&"utils.py".to_string()));
920    }
921
922    #[test]
923    fn test_edge_operations() {
924        let mut graph = DependencyGraph::new();
925
926        // Add edge (automatically creates nodes)
927        graph
928            .add_edge("main.py".to_string(), "utils.py".to_string())
929            .unwrap();
930
931        assert_eq!(graph.node_count(), 2);
932        assert_eq!(graph.edge_count(), 1);
933        assert!(graph.contains_edge(&"main.py".to_string(), &"utils.py".to_string()));
934
935        // Check degrees
936        assert_eq!(graph.out_degree(&"main.py".to_string()), 1);
937        assert_eq!(graph.in_degree(&"utils.py".to_string()), 1);
938        assert_eq!(graph.in_degree(&"main.py".to_string()), 0);
939        assert_eq!(graph.out_degree(&"utils.py".to_string()), 0);
940    }
941
942    #[test]
943    fn test_multiple_edges() {
944        let mut graph = DependencyGraph::new();
945
946        let edges = vec![
947            ("main.py".to_string(), "utils.py".to_string()),
948            ("main.py".to_string(), "config.py".to_string()),
949            ("utils.py".to_string(), "config.py".to_string()),
950        ];
951
952        graph.add_edges(&edges).unwrap();
953
954        assert_eq!(graph.node_count(), 3);
955        assert_eq!(graph.edge_count(), 3);
956
957        // main.py should have out-degree 2
958        assert_eq!(graph.out_degree(&"main.py".to_string()), 2);
959
960        // config.py should have in-degree 2
961        assert_eq!(graph.in_degree(&"config.py".to_string()), 2);
962    }
963
964    #[test]
965    fn test_node_metadata() {
966        let mut graph = DependencyGraph::new();
967
968        let metadata = NodeMetadata::new("main.py".to_string()).with_size(1024);
969        graph
970            .add_node_with_metadata("main.py".to_string(), metadata)
971            .unwrap();
972
973        let retrieved = graph.node_metadata(&"main.py".to_string()).unwrap();
974        assert_eq!(retrieved.file_path, "main.py");
975        assert_eq!(retrieved.language, Some("python".to_string()));
976        assert!(retrieved.is_entrypoint);
977        assert!(!retrieved.is_test);
978        assert_eq!(retrieved.size_bytes, 1024);
979    }
980
981    // Note: Language detection and file classification tests removed
982    // as they depend on functions that don't exist in this module
983
984    #[test]
985    fn test_pagerank_iterator() {
986        let mut graph = DependencyGraph::new();
987
988        // Build a small graph: A -> B -> C, C -> A (creates cycle)
989        graph.add_edge("A".to_string(), "B".to_string()).unwrap();
990        graph.add_edge("B".to_string(), "C".to_string()).unwrap();
991        graph.add_edge("C".to_string(), "A".to_string()).unwrap();
992
993        let pagerank_data: Vec<_> = graph.pagerank_iterator().collect();
994        assert_eq!(pagerank_data.len(), 3);
995
996        // Each node should have incoming edges (reverse edges)
997        for (node, reverse_edges) in pagerank_data {
998            assert!(reverse_edges.is_some());
999            assert!(!reverse_edges.unwrap().is_empty());
1000        }
1001    }
1002
1003    #[test]
1004    fn test_dangling_nodes() {
1005        let mut graph = DependencyGraph::new();
1006
1007        // A -> B, C is isolated, D has no outgoing edges
1008        graph.add_edge("A".to_string(), "B".to_string()).unwrap();
1009        graph.add_node("C".to_string()).unwrap();
1010        graph.add_edge("B".to_string(), "D".to_string()).unwrap();
1011
1012        let dangling = graph.dangling_nodes();
1013
1014        // C and D should be dangling (no outgoing edges)
1015        assert_eq!(dangling.len(), 2);
1016        assert!(dangling.contains(&&"C".to_string()));
1017        assert!(dangling.contains(&&"D".to_string()));
1018    }
1019
1020    #[test]
1021    fn test_concurrent_graph() {
1022        let mut graph = DependencyGraph::new();
1023        graph.add_edge("A".to_string(), "B".to_string()).unwrap();
1024        graph.add_edge("B".to_string(), "C".to_string()).unwrap();
1025
1026        let concurrent = graph.into_concurrent();
1027
1028        // Test concurrent operations
1029        assert_eq!(concurrent.in_degree(&"B".to_string()), 1);
1030        assert_eq!(concurrent.out_degree(&"B".to_string()), 1);
1031
1032        // Add node concurrently
1033        concurrent.add_node("D".to_string()).unwrap();
1034
1035        // Convert back to sequential
1036        let sequential = concurrent.into_sequential();
1037        assert_eq!(sequential.node_count(), 4); // A, B, C, D
1038    }
1039
1040    #[test]
1041    fn test_scc_estimation() {
1042        let mut graph = DependencyGraph::new();
1043
1044        // Create a graph with potential cycles: A <-> B, C -> D
1045        graph.add_edge("A".to_string(), "B".to_string()).unwrap();
1046        graph.add_edge("B".to_string(), "A".to_string()).unwrap();
1047        graph.add_edge("C".to_string(), "D".to_string()).unwrap();
1048        graph.add_node("E".to_string()).unwrap(); // Isolated
1049
1050        let scc_count = graph.estimate_scc_count();
1051
1052        // Should estimate: 1 SCC (A,B have both in/out), plus isolated/chain nodes (C,D,E)
1053        assert!(scc_count >= 3); // At least C, D, E as separate components
1054    }
1055
1056    #[test]
1057    fn test_nodes_by_language() {
1058        let mut graph = DependencyGraph::new();
1059
1060        graph.add_node("main.py".to_string()).unwrap();
1061        graph.add_node("utils.py".to_string()).unwrap();
1062        graph.add_node("app.js".to_string()).unwrap();
1063        graph.add_node("lib.rs".to_string()).unwrap();
1064
1065        let python_nodes = graph.nodes_by_language("python");
1066        let js_nodes = graph.nodes_by_language("javascript");
1067        let rust_nodes = graph.nodes_by_language("rust");
1068
1069        assert_eq!(python_nodes.len(), 2);
1070        assert_eq!(js_nodes.len(), 1);
1071        assert_eq!(rust_nodes.len(), 1);
1072
1073        assert!(python_nodes.contains(&&"main.py".to_string()));
1074        assert!(python_nodes.contains(&&"utils.py".to_string()));
1075    }
1076
1077    #[test]
1078    fn test_transitive_dependencies() {
1079        let mut graph = DependencyGraph::new();
1080
1081        // Create dependency chain: A -> B -> C -> D
1082        graph.add_edge("A".to_string(), "B".to_string()).unwrap();
1083        graph.add_edge("B".to_string(), "C".to_string()).unwrap();
1084        graph.add_edge("C".to_string(), "D".to_string()).unwrap();
1085
1086        // A should transitively depend on B, C, D
1087        let deps = graph.transitive_dependencies(&"A".to_string(), None);
1088        assert_eq!(deps.len(), 3);
1089        assert!(deps.contains(&"B".to_string()));
1090        assert!(deps.contains(&"C".to_string()));
1091        assert!(deps.contains(&"D".to_string()));
1092
1093        // B should transitively depend on C, D
1094        let deps = graph.transitive_dependencies(&"B".to_string(), None);
1095        assert_eq!(deps.len(), 2);
1096        assert!(deps.contains(&"C".to_string()));
1097        assert!(deps.contains(&"D".to_string()));
1098
1099        // D has no dependencies
1100        let deps = graph.transitive_dependencies(&"D".to_string(), None);
1101        assert_eq!(deps.len(), 0);
1102    }
1103
1104    #[test]
1105    fn test_transitive_dependencies_with_depth_limit() {
1106        let mut graph = DependencyGraph::new();
1107
1108        // Create dependency chain: A -> B -> C -> D
1109        graph.add_edge("A".to_string(), "B".to_string()).unwrap();
1110        graph.add_edge("B".to_string(), "C".to_string()).unwrap();
1111        graph.add_edge("C".to_string(), "D".to_string()).unwrap();
1112
1113        // Limit to depth 1: only direct dependencies
1114        let deps = graph.transitive_dependencies(&"A".to_string(), Some(1));
1115        assert_eq!(deps.len(), 1);
1116        assert!(deps.contains(&"B".to_string()));
1117
1118        // Limit to depth 2
1119        let deps = graph.transitive_dependencies(&"A".to_string(), Some(2));
1120        assert_eq!(deps.len(), 2);
1121        assert!(deps.contains(&"B".to_string()));
1122        assert!(deps.contains(&"C".to_string()));
1123    }
1124
1125    #[test]
1126    fn test_transitive_dependents() {
1127        let mut graph = DependencyGraph::new();
1128
1129        // Create dependency chain: A -> B -> C -> D
1130        // D is depended on by C, B (transitively), A (transitively)
1131        graph.add_edge("A".to_string(), "B".to_string()).unwrap();
1132        graph.add_edge("B".to_string(), "C".to_string()).unwrap();
1133        graph.add_edge("C".to_string(), "D".to_string()).unwrap();
1134
1135        // D should have transitive dependents: C, B, A
1136        let dependents = graph.transitive_dependents(&"D".to_string(), None);
1137        assert_eq!(dependents.len(), 3);
1138        assert!(dependents.contains(&"C".to_string()));
1139        assert!(dependents.contains(&"B".to_string()));
1140        assert!(dependents.contains(&"A".to_string()));
1141
1142        // C should have transitive dependents: B, A
1143        let dependents = graph.transitive_dependents(&"C".to_string(), None);
1144        assert_eq!(dependents.len(), 2);
1145        assert!(dependents.contains(&"B".to_string()));
1146        assert!(dependents.contains(&"A".to_string()));
1147
1148        // A has no dependents
1149        let dependents = graph.transitive_dependents(&"A".to_string(), None);
1150        assert_eq!(dependents.len(), 0);
1151    }
1152
1153    #[test]
1154    fn test_compute_closure_dependencies() {
1155        let mut graph = DependencyGraph::new();
1156
1157        // Create a diamond dependency: A -> B, A -> C, B -> D, C -> D
1158        graph.add_edge("A".to_string(), "B".to_string()).unwrap();
1159        graph.add_edge("A".to_string(), "C".to_string()).unwrap();
1160        graph.add_edge("B".to_string(), "D".to_string()).unwrap();
1161        graph.add_edge("C".to_string(), "D".to_string()).unwrap();
1162
1163        // Closure of A should include A, B, C, D
1164        let closure = graph.compute_closure(
1165            &["A".to_string()],
1166            TraversalDirection::Dependencies,
1167            None,
1168        );
1169        assert_eq!(closure.len(), 4);
1170        assert!(closure.contains(&"A".to_string()));
1171        assert!(closure.contains(&"B".to_string()));
1172        assert!(closure.contains(&"C".to_string()));
1173        assert!(closure.contains(&"D".to_string()));
1174    }
1175
1176    #[test]
1177    fn test_compute_closure_both_directions() {
1178        let mut graph = DependencyGraph::new();
1179
1180        // Create: A -> B -> C
1181        graph.add_edge("A".to_string(), "B".to_string()).unwrap();
1182        graph.add_edge("B".to_string(), "C".to_string()).unwrap();
1183
1184        // Closure of B in both directions should include A, B, C
1185        let closure = graph.compute_closure(
1186            &["B".to_string()],
1187            TraversalDirection::Both,
1188            None,
1189        );
1190        assert_eq!(closure.len(), 3);
1191        assert!(closure.contains(&"A".to_string()));
1192        assert!(closure.contains(&"B".to_string()));
1193        assert!(closure.contains(&"C".to_string()));
1194    }
1195
1196    #[test]
1197    fn test_compute_closure_multiple_seeds() {
1198        let mut graph = DependencyGraph::new();
1199
1200        // Create two separate chains: A -> B and C -> D
1201        graph.add_edge("A".to_string(), "B".to_string()).unwrap();
1202        graph.add_edge("C".to_string(), "D".to_string()).unwrap();
1203
1204        // Closure of [A, C] should include all four nodes
1205        let closure = graph.compute_closure(
1206            &["A".to_string(), "C".to_string()],
1207            TraversalDirection::Dependencies,
1208            None,
1209        );
1210        assert_eq!(closure.len(), 4);
1211        assert!(closure.contains(&"A".to_string()));
1212        assert!(closure.contains(&"B".to_string()));
1213        assert!(closure.contains(&"C".to_string()));
1214        assert!(closure.contains(&"D".to_string()));
1215    }
1216}