Skip to main content

manifoldb_graph/analytics/
pagerank.rs

1//! PageRank algorithm implementation.
2//!
3//! This module implements the PageRank algorithm using the iterative power method.
4//! PageRank assigns an importance score to each node based on the link structure
5//! of the graph.
6//!
7//! # Algorithm
8//!
9//! PageRank uses an iterative approach:
10//! 1. Initialize all nodes with equal rank (1/N)
11//! 2. For each iteration:
12//!    - Each node distributes its rank equally among outgoing neighbors
13//!    - Apply damping factor to handle random jumps
14//!    - Check for convergence
15//! 3. Repeat until convergence or max iterations reached
16//!
17//! # Formula
18//!
19//! PR(u) = (1-d)/N + d * Σ(PR(v)/L(v)) for all v linking to u
20//!
21//! Where:
22//! - d is the damping factor (typically 0.85)
23//! - N is the total number of nodes
24//! - L(v) is the out-degree of node v
25//!
26//! # Example
27//!
28//! ```ignore
29//! use manifoldb_graph::analytics::{PageRank, PageRankConfig};
30//!
31//! let config = PageRankConfig::default()
32//!     .with_damping_factor(0.85)
33//!     .with_max_iterations(100)
34//!     .with_tolerance(1e-6);
35//!
36//! let result = PageRank::compute(&tx, &config)?;
37//!
38//! // Get top nodes by PageRank
39//! for (node, score) in result.top_n(10) {
40//!     println!("Node {:?}: {:.6}", node, score);
41//! }
42//! ```
43
44use std::collections::HashMap;
45
46use manifoldb_core::EntityId;
47use manifoldb_storage::Transaction;
48
49use crate::index::AdjacencyIndex;
50use crate::store::{EdgeStore, GraphError, GraphResult, NodeStore};
51
52/// Default maximum number of nodes for graph analytics operations.
53/// Set to 10 million nodes as a reasonable default for most systems.
54pub const DEFAULT_MAX_GRAPH_NODES: usize = 10_000_000;
55
56/// Configuration for PageRank algorithm.
57#[derive(Debug, Clone)]
58pub struct PageRankConfig {
59    /// Damping factor (probability of following a link vs random jump).
60    /// Default: 0.85
61    pub damping_factor: f64,
62
63    /// Maximum number of iterations before stopping.
64    /// Default: 100
65    pub max_iterations: usize,
66
67    /// Convergence tolerance. Algorithm stops when max score change < tolerance.
68    /// Default: 1e-6
69    pub tolerance: f64,
70
71    /// Whether to normalize scores to sum to 1.0.
72    /// Default: true
73    pub normalize: bool,
74
75    /// Maximum number of nodes allowed before returning an error.
76    /// Set to `None` to disable the check.
77    /// Default: 10,000,000 (10M nodes)
78    pub max_graph_nodes: Option<usize>,
79}
80
81impl Default for PageRankConfig {
82    fn default() -> Self {
83        Self {
84            damping_factor: 0.85,
85            max_iterations: 100,
86            tolerance: 1e-6,
87            normalize: true,
88            max_graph_nodes: Some(DEFAULT_MAX_GRAPH_NODES),
89        }
90    }
91}
92
93impl PageRankConfig {
94    /// Create a new configuration with default values.
95    pub fn new() -> Self {
96        Self::default()
97    }
98
99    /// Set the damping factor.
100    ///
101    /// The damping factor represents the probability that a random walker
102    /// follows a link instead of jumping to a random node.
103    /// Common values are 0.85 (default) or 0.9.
104    pub const fn with_damping_factor(mut self, damping_factor: f64) -> Self {
105        self.damping_factor = damping_factor;
106        self
107    }
108
109    /// Set the maximum number of iterations.
110    pub const fn with_max_iterations(mut self, max_iterations: usize) -> Self {
111        self.max_iterations = max_iterations;
112        self
113    }
114
115    /// Set the convergence tolerance.
116    ///
117    /// The algorithm stops when the maximum change in any node's score
118    /// between iterations is less than this value.
119    pub const fn with_tolerance(mut self, tolerance: f64) -> Self {
120        self.tolerance = tolerance;
121        self
122    }
123
124    /// Set whether to normalize scores to sum to 1.0.
125    pub const fn with_normalize(mut self, normalize: bool) -> Self {
126        self.normalize = normalize;
127        self
128    }
129
130    /// Set the maximum number of nodes allowed.
131    ///
132    /// If the graph has more nodes than this limit, the algorithm will
133    /// return a [`GraphError::GraphTooLarge`] error instead of attempting
134    /// to allocate potentially gigabytes of memory.
135    ///
136    /// Set to `None` to disable the check (use with caution).
137    ///
138    /// [`GraphError::GraphTooLarge`]: crate::store::GraphError::GraphTooLarge
139    pub const fn with_max_graph_nodes(mut self, limit: Option<usize>) -> Self {
140        self.max_graph_nodes = limit;
141        self
142    }
143}
144
145/// Result of PageRank computation.
146#[derive(Debug, Clone)]
147pub struct PageRankResult {
148    /// PageRank scores for each node.
149    pub scores: HashMap<EntityId, f64>,
150
151    /// Number of iterations performed.
152    pub iterations: usize,
153
154    /// Whether the algorithm converged within tolerance.
155    pub converged: bool,
156
157    /// Final convergence delta (max change in last iteration).
158    pub final_delta: f64,
159}
160
161impl PageRankResult {
162    /// Get the PageRank score for a specific node.
163    pub fn score(&self, node: EntityId) -> Option<f64> {
164        self.scores.get(&node).copied()
165    }
166
167    /// Get nodes sorted by PageRank score (descending).
168    pub fn sorted(&self) -> Vec<(EntityId, f64)> {
169        let mut pairs: Vec<_> = self.scores.iter().map(|(&id, &score)| (id, score)).collect();
170        pairs.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
171        pairs
172    }
173
174    /// Get the top N nodes by PageRank score.
175    pub fn top_n(&self, n: usize) -> Vec<(EntityId, f64)> {
176        self.sorted().into_iter().take(n).collect()
177    }
178
179    /// Get the node with the highest PageRank score.
180    pub fn max(&self) -> Option<(EntityId, f64)> {
181        self.scores
182            .iter()
183            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
184            .map(|(&id, &score)| (id, score))
185    }
186
187    /// Get the node with the lowest PageRank score.
188    pub fn min(&self) -> Option<(EntityId, f64)> {
189        self.scores
190            .iter()
191            .min_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
192            .map(|(&id, &score)| (id, score))
193    }
194}
195
196/// PageRank algorithm implementation.
197///
198/// PageRank assigns importance scores to nodes based on the link structure
199/// of the graph. Nodes with many incoming links from important nodes will
200/// have higher scores.
201pub struct PageRank;
202
203impl PageRank {
204    /// Compute PageRank scores for all nodes in the graph.
205    ///
206    /// # Arguments
207    ///
208    /// * `tx` - The transaction to use for graph access
209    /// * `config` - Configuration parameters for the algorithm
210    ///
211    /// # Returns
212    ///
213    /// A `PageRankResult` containing scores for all nodes.
214    ///
215    /// # Algorithm
216    ///
217    /// Uses the iterative power method:
218    /// 1. Initialize all nodes with score 1/N
219    /// 2. Iteratively update scores based on incoming links
220    /// 3. Handle dangling nodes (nodes with no outgoing edges)
221    /// 4. Check for convergence after each iteration
222    pub fn compute<T: Transaction>(tx: &T, config: &PageRankConfig) -> GraphResult<PageRankResult> {
223        // Check graph size before allocating large data structures
224        if let Some(limit) = config.max_graph_nodes {
225            let node_count = NodeStore::count(tx)?;
226            if node_count > limit {
227                return Err(GraphError::GraphTooLarge { node_count, limit });
228            }
229        }
230
231        // Collect all nodes and build adjacency information
232        let mut nodes: Vec<EntityId> = Vec::new();
233        NodeStore::for_each(tx, |entity| {
234            nodes.push(entity.id);
235            true
236        })?;
237
238        let n = nodes.len();
239        if n == 0 {
240            return Ok(PageRankResult {
241                scores: HashMap::new(),
242                iterations: 0,
243                converged: true,
244                final_delta: 0.0,
245            });
246        }
247
248        // Build node index for fast lookup - pre-allocate with known size
249        let mut node_index: HashMap<EntityId, usize> = HashMap::with_capacity(n);
250        for (i, &id) in nodes.iter().enumerate() {
251            node_index.insert(id, i);
252        }
253
254        // Build outgoing edge lists and compute out-degrees
255        // Pre-allocate inner vecs with typical degree estimate
256        const AVG_DEGREE_ESTIMATE: usize = 8;
257        let mut out_neighbors: Vec<Vec<usize>> =
258            (0..n).map(|_| Vec::with_capacity(AVG_DEGREE_ESTIMATE)).collect();
259        let mut out_degrees: Vec<usize> = vec![0; n];
260
261        for (i, &node) in nodes.iter().enumerate() {
262            // Get outgoing edges
263            let outgoing = AdjacencyIndex::get_outgoing_edge_ids(tx, node)?;
264            out_degrees[i] = outgoing.len();
265
266            for edge_id in outgoing {
267                if let Some(edge) = EdgeStore::get(tx, edge_id)? {
268                    if let Some(&target_idx) = node_index.get(&edge.target) {
269                        out_neighbors[i].push(target_idx);
270                    }
271                }
272            }
273        }
274
275        // Build incoming edge lists for efficient iteration
276        // Pre-allocate inner vecs with typical degree estimate
277        let mut in_neighbors: Vec<Vec<usize>> =
278            (0..n).map(|_| Vec::with_capacity(AVG_DEGREE_ESTIMATE)).collect();
279        for (i, neighbors) in out_neighbors.iter().enumerate() {
280            for &j in neighbors {
281                in_neighbors[j].push(i);
282            }
283        }
284
285        // Initialize scores
286        let initial_score = 1.0 / n as f64;
287        let mut scores: Vec<f64> = vec![initial_score; n];
288        let mut new_scores: Vec<f64> = vec![0.0; n];
289
290        let d = config.damping_factor;
291        let base_score = (1.0 - d) / n as f64;
292
293        let mut iterations = 0;
294        let mut converged = false;
295        let mut final_delta = f64::MAX;
296
297        // Iterative computation
298        while iterations < config.max_iterations {
299            iterations += 1;
300
301            // Calculate dangling node contribution (nodes with no outgoing edges)
302            let dangling_sum: f64 = nodes
303                .iter()
304                .enumerate()
305                .filter(|(i, _)| out_degrees[*i] == 0)
306                .map(|(i, _)| scores[i])
307                .sum();
308            let dangling_contribution = d * dangling_sum / n as f64;
309
310            // Calculate new scores
311            for (i, incoming) in in_neighbors.iter().enumerate() {
312                let mut link_sum = 0.0;
313                for &j in incoming {
314                    if out_degrees[j] > 0 {
315                        link_sum += scores[j] / out_degrees[j] as f64;
316                    }
317                }
318                new_scores[i] = base_score + d * link_sum + dangling_contribution;
319            }
320
321            // Check convergence
322            let mut max_delta = 0.0f64;
323            for i in 0..n {
324                let delta = (new_scores[i] - scores[i]).abs();
325                max_delta = max_delta.max(delta);
326            }
327
328            final_delta = max_delta;
329
330            // Swap scores
331            std::mem::swap(&mut scores, &mut new_scores);
332
333            if max_delta < config.tolerance {
334                converged = true;
335                break;
336            }
337        }
338
339        // Normalize if requested
340        if config.normalize {
341            let total: f64 = scores.iter().sum();
342            if total > 0.0 {
343                for score in &mut scores {
344                    *score /= total;
345                }
346            }
347        }
348
349        // Build result map
350        let scores_map: HashMap<EntityId, f64> = nodes.into_iter().zip(scores).collect();
351
352        Ok(PageRankResult { scores: scores_map, iterations, converged, final_delta })
353    }
354
355    /// Compute PageRank for a subset of nodes.
356    ///
357    /// This is useful for computing PageRank on a subgraph defined by
358    /// a specific set of nodes.
359    ///
360    /// # Arguments
361    ///
362    /// * `tx` - The transaction to use for graph access
363    /// * `nodes` - The nodes to include in the computation
364    /// * `config` - Configuration parameters for the algorithm
365    pub fn compute_for_nodes<T: Transaction>(
366        tx: &T,
367        nodes: &[EntityId],
368        config: &PageRankConfig,
369    ) -> GraphResult<PageRankResult> {
370        let n = nodes.len();
371        if n == 0 {
372            return Ok(PageRankResult {
373                scores: HashMap::new(),
374                iterations: 0,
375                converged: true,
376                final_delta: 0.0,
377            });
378        }
379
380        // Build node index for fast lookup - pre-allocate with known size
381        let mut node_set: std::collections::HashSet<EntityId> =
382            std::collections::HashSet::with_capacity(n);
383        for &id in nodes {
384            node_set.insert(id);
385        }
386        let mut node_index: HashMap<EntityId, usize> = HashMap::with_capacity(n);
387        for (i, &id) in nodes.iter().enumerate() {
388            node_index.insert(id, i);
389        }
390
391        // Build adjacency information for the subgraph
392        // Pre-allocate inner vecs with typical degree estimate
393        const AVG_DEGREE_ESTIMATE: usize = 8;
394        let mut out_neighbors: Vec<Vec<usize>> =
395            (0..n).map(|_| Vec::with_capacity(AVG_DEGREE_ESTIMATE)).collect();
396        let mut out_degrees: Vec<usize> = vec![0; n];
397
398        for (i, &node) in nodes.iter().enumerate() {
399            let outgoing = AdjacencyIndex::get_outgoing_edge_ids(tx, node)?;
400
401            for edge_id in outgoing {
402                if let Some(edge) = EdgeStore::get(tx, edge_id)? {
403                    // Only include edges to nodes in the subgraph
404                    if node_set.contains(&edge.target) {
405                        if let Some(&target_idx) = node_index.get(&edge.target) {
406                            out_neighbors[i].push(target_idx);
407                            out_degrees[i] += 1;
408                        }
409                    }
410                }
411            }
412        }
413
414        // Build incoming edge lists
415        // Pre-allocate inner vecs with typical degree estimate
416        let mut in_neighbors: Vec<Vec<usize>> =
417            (0..n).map(|_| Vec::with_capacity(AVG_DEGREE_ESTIMATE)).collect();
418        for (i, neighbors) in out_neighbors.iter().enumerate() {
419            for &j in neighbors {
420                in_neighbors[j].push(i);
421            }
422        }
423
424        // Initialize scores
425        let initial_score = 1.0 / n as f64;
426        let mut scores: Vec<f64> = vec![initial_score; n];
427        let mut new_scores: Vec<f64> = vec![0.0; n];
428
429        let d = config.damping_factor;
430        let base_score = (1.0 - d) / n as f64;
431
432        let mut iterations = 0;
433        let mut converged = false;
434        let mut final_delta = f64::MAX;
435
436        while iterations < config.max_iterations {
437            iterations += 1;
438
439            // Calculate dangling node contribution
440            let dangling_sum: f64 =
441                (0..n).filter(|&i| out_degrees[i] == 0).map(|i| scores[i]).sum();
442            let dangling_contribution = d * dangling_sum / n as f64;
443
444            // Calculate new scores
445            for (i, incoming) in in_neighbors.iter().enumerate() {
446                let mut link_sum = 0.0;
447                for &j in incoming {
448                    if out_degrees[j] > 0 {
449                        link_sum += scores[j] / out_degrees[j] as f64;
450                    }
451                }
452                new_scores[i] = base_score + d * link_sum + dangling_contribution;
453            }
454
455            // Check convergence
456            let mut max_delta = 0.0f64;
457            for i in 0..n {
458                let delta = (new_scores[i] - scores[i]).abs();
459                max_delta = max_delta.max(delta);
460            }
461
462            final_delta = max_delta;
463            std::mem::swap(&mut scores, &mut new_scores);
464
465            if max_delta < config.tolerance {
466                converged = true;
467                break;
468            }
469        }
470
471        // Normalize if requested
472        if config.normalize {
473            let total: f64 = scores.iter().sum();
474            if total > 0.0 {
475                for score in &mut scores {
476                    *score /= total;
477                }
478            }
479        }
480
481        // Build result map
482        let scores_map: HashMap<EntityId, f64> = nodes.iter().copied().zip(scores).collect();
483
484        Ok(PageRankResult { scores: scores_map, iterations, converged, final_delta })
485    }
486}
487
488#[cfg(test)]
489mod tests {
490    use super::*;
491
492    #[test]
493    fn config_defaults() {
494        let config = PageRankConfig::default();
495        assert!((config.damping_factor - 0.85).abs() < f64::EPSILON);
496        assert_eq!(config.max_iterations, 100);
497        assert!((config.tolerance - 1e-6).abs() < f64::EPSILON);
498        assert!(config.normalize);
499    }
500
501    #[test]
502    fn config_builder() {
503        let config = PageRankConfig::new()
504            .with_damping_factor(0.9)
505            .with_max_iterations(50)
506            .with_tolerance(1e-8)
507            .with_normalize(false);
508
509        assert!((config.damping_factor - 0.9).abs() < f64::EPSILON);
510        assert_eq!(config.max_iterations, 50);
511        assert!((config.tolerance - 1e-8).abs() < f64::EPSILON);
512        assert!(!config.normalize);
513    }
514
515    #[test]
516    fn result_empty() {
517        let result = PageRankResult {
518            scores: HashMap::new(),
519            iterations: 0,
520            converged: true,
521            final_delta: 0.0,
522        };
523
524        assert!(result.score(EntityId::new(1)).is_none());
525        assert!(result.sorted().is_empty());
526        assert!(result.top_n(10).is_empty());
527        assert!(result.max().is_none());
528        assert!(result.min().is_none());
529    }
530
531    #[test]
532    fn result_sorted() {
533        let mut scores = HashMap::new();
534        scores.insert(EntityId::new(1), 0.3);
535        scores.insert(EntityId::new(2), 0.5);
536        scores.insert(EntityId::new(3), 0.2);
537
538        let result = PageRankResult { scores, iterations: 10, converged: true, final_delta: 1e-7 };
539
540        let sorted = result.sorted();
541        assert_eq!(sorted.len(), 3);
542        assert_eq!(sorted[0].0, EntityId::new(2)); // highest
543        assert_eq!(sorted[1].0, EntityId::new(1));
544        assert_eq!(sorted[2].0, EntityId::new(3)); // lowest
545    }
546
547    #[test]
548    fn result_top_n() {
549        let mut scores = HashMap::new();
550        scores.insert(EntityId::new(1), 0.3);
551        scores.insert(EntityId::new(2), 0.5);
552        scores.insert(EntityId::new(3), 0.2);
553
554        let result = PageRankResult { scores, iterations: 10, converged: true, final_delta: 1e-7 };
555
556        let top2 = result.top_n(2);
557        assert_eq!(top2.len(), 2);
558        assert_eq!(top2[0].0, EntityId::new(2));
559        assert_eq!(top2[1].0, EntityId::new(1));
560    }
561
562    #[test]
563    fn result_max_min() {
564        let mut scores = HashMap::new();
565        scores.insert(EntityId::new(1), 0.3);
566        scores.insert(EntityId::new(2), 0.5);
567        scores.insert(EntityId::new(3), 0.2);
568
569        let result = PageRankResult { scores, iterations: 10, converged: true, final_delta: 1e-7 };
570
571        let max = result.max().unwrap();
572        assert_eq!(max.0, EntityId::new(2));
573        assert!((max.1 - 0.5).abs() < f64::EPSILON);
574
575        let min = result.min().unwrap();
576        assert_eq!(min.0, EntityId::new(3));
577        assert!((min.1 - 0.2).abs() < f64::EPSILON);
578    }
579}