sqlitegraph/algo/community.rs
1//! Community detection algorithms for graph analysis.
2//!
3//! This module provides algorithms for discovering communities (clusters) in graphs.
4//! Community detection groups nodes that are more densely connected to each other
5//! than to nodes outside the group.
6//!
7//! # Available Algorithms
8//!
9//! - [`label_propagation`] - Fast label propagation for community discovery
10//! - [`louvain_communities`] - Louvain method for modularity optimization
11//!
12//! # When to Use Community Detection
13//!
14//! - **Label Propagation**: Fast community detection on large graphs, exploratory
15//! analysis where speed matters more than quality, baseline comparison for other
16//! clustering methods, incremental clustering where results update frequently
17//! - **Louvain**: High-quality community detection where modularity matters,
18//! hierarchical clustering to reveal multi-scale structure, research applications
19//! requiring reproducible results, final clustering when offline computation is
20//! acceptable
21
22use ahash::AHashMap;
23
24use crate::progress::ProgressCallback;
25use crate::{errors::SqliteGraphError, graph::SqliteGraph};
26
27/// Label Propagation algorithm for fast community detection.
28///
29/// Each node starts with its own label, then iteratively adopts the most frequent
30/// label among its neighbors. Converges when no labels change or max_iterations reached.
31///
32/// This is a near-linear time algorithm suitable for large graphs. Uses deterministic
33/// tiebreaking (smallest label wins) for reproducible results.
34///
35/// # Arguments
36/// * `graph` - The graph to analyze
37/// * `max_iterations` - Maximum number of iterations to prevent infinite loops (typically 5-10)
38///
39/// # Returns
40/// Communities as vectors of node IDs, sorted by smallest node ID in each community.
41///
42/// # Complexity
43/// Time: O(k * |E|) where k = iterations (typically 5-10)
44/// Space: O(|V|) for label storage
45///
46/// # Algorithm Details
47/// - Initialize each node with unique label (node ID)
48/// - Iteratively adopt most frequent neighbor label
49/// - Bidirectional edges (both incoming and outgoing neighbors)
50/// - Deterministic tiebreaking: smallest label wins
51/// - Early stopping when converged (no labels change)
52///
53/// # References
54/// - Raghavan, U. N., Albert, R., & Kumara, S. (2007). "Near linear time algorithm to detect community structures in large-scale networks."
55///
56/// # Example
57/// ```rust
58/// use sqlitegraph::{SqliteGraph, algo::label_propagation};
59/// let graph = SqliteGraph::open_in_memory()?;
60/// // ... add nodes and edges ...
61/// let communities = label_propagation(&graph, 10)?;
62/// ```
63pub fn label_propagation(
64 graph: &SqliteGraph,
65 max_iterations: usize,
66) -> Result<Vec<Vec<i64>>, SqliteGraphError> {
67 let all_ids = graph.all_entity_ids()?;
68
69 if all_ids.is_empty() {
70 return Ok(Vec::new());
71 }
72
73 // Initialize: each node gets its own label
74 let mut labels: AHashMap<i64, i64> = all_ids.iter().map(|&id| (id, id)).collect();
75
76 // For deterministic results, process nodes in sorted order
77 let mut node_order: Vec<i64> = all_ids.clone();
78 node_order.sort();
79
80 // Iterative label propagation
81 for _iteration in 0..max_iterations {
82 let mut any_changed = false;
83
84 for &node in &node_order {
85 // Count neighbor labels
86 let mut label_counts: AHashMap<i64, usize> = AHashMap::new();
87
88 // Count outgoing neighbors
89 for &neighbor in &graph.fetch_outgoing(node)? {
90 let neighbor_label = labels.get(&neighbor).unwrap_or(&neighbor);
91 *label_counts.entry(*neighbor_label).or_insert(0) += 1;
92 }
93
94 // Count incoming neighbors
95 for &neighbor in &graph.fetch_incoming(node)? {
96 let neighbor_label = labels.get(&neighbor).unwrap_or(&neighbor);
97 *label_counts.entry(*neighbor_label).or_insert(0) += 1;
98 }
99
100 // Find most frequent label (deterministic tiebreak: smallest label)
101 if let Some((&_most_frequent_label, _)) = label_counts
102 .iter()
103 .max_by_key(|(_, count)| *count)
104 .map(|(label, count)| (label, *count))
105 {
106 // In case of ties, max_by_key returns arbitrary one
107 // So we need to find all with max count and take smallest label
108 let max_count = *label_counts.values().max().unwrap_or(&0);
109 let best_label = label_counts
110 .iter()
111 .filter(|(_, count)| **count == max_count)
112 .map(|(&label, _)| label)
113 .min()
114 .unwrap_or(node);
115
116 if let Some(current_label) = labels.get(&node) {
117 if *current_label != best_label {
118 labels.insert(node, best_label);
119 any_changed = true;
120 }
121 }
122 }
123 }
124
125 if !any_changed {
126 break;
127 }
128 }
129
130 // Group nodes by final label
131 let mut communities_map: AHashMap<i64, Vec<i64>> = AHashMap::new();
132 for (node, label) in &labels {
133 communities_map
134 .entry(*label)
135 .or_insert_with(Vec::new)
136 .push(*node);
137 }
138
139 // Convert to sorted vector of communities
140 let mut communities: Vec<Vec<i64>> = communities_map.into_values().collect();
141 for community in &mut communities {
142 community.sort();
143 }
144 communities.sort_by(|a, b| a.first().cmp(&b.first()));
145
146 Ok(communities)
147}
148
149/// Louvain method for community detection via modularity optimization.
150///
151/// Iteratively moves nodes to maximize modularity (how many edges are within
152/// communities vs between communities). This is a simplified single-pass version.
153///
154/// # Arguments
155/// * `graph` - The graph to analyze
156/// * `max_iterations` - Maximum number of iterations to prevent infinite loops (typically 10-20)
157///
158/// # Returns
159/// Communities as vectors of node IDs, sorted by smallest node ID in each community.
160///
161/// # Complexity
162/// Time: O(k * |V| * |E|) where k = iterations
163/// Space: O(|V|) for community assignments and degrees
164///
165/// # Algorithm Details
166/// Simplified single-pass modularity optimization (no multi-level aggregation):
167/// 1. Initialize each node in its own community
168/// 2. Calculate total edges (m) and node degrees
169/// 3. Iteratively move nodes to maximize modularity delta:
170/// ΔQ = (2*edges_to_community - node_degree*community_degree/m) / (2*m)
171/// 4. Stop when no moves improve modularity
172///
173/// Modularity measures edge density within communities vs random expectation.
174/// Higher values indicate better community structure (typical range: 0.3-0.7).
175///
176/// # Caveats
177/// - Simplified version (no multi-level aggregation)
178/// - May converge to local optima (not guaranteed global optimum)
179/// - Performance depends on graph structure and edge distribution
180///
181/// # References
182/// - Blondel, V. D., et al. (2008). "Fast unfolding of communities in large networks."
183///
184/// # Example
185/// ```rust
186/// use sqlitegraph::{SqliteGraph, algo::louvain_communities};
187/// let graph = SqliteGraph::open_in_memory()?;
188/// // ... add nodes and edges ...
189/// let communities = louvain_communities(&graph, 10)?;
190/// ```
191pub fn louvain_communities(
192 graph: &SqliteGraph,
193 max_iterations: usize,
194) -> Result<Vec<Vec<i64>>, SqliteGraphError> {
195 let all_ids = graph.all_entity_ids()?;
196
197 if all_ids.is_empty() {
198 return Ok(Vec::new());
199 }
200
201 // Calculate total edges (m) and node degrees
202 let mut total_edges = 0usize;
203 let mut degrees: AHashMap<i64, usize> = AHashMap::new();
204
205 for &id in &all_ids {
206 let out_count = graph.fetch_outgoing(id)?.len();
207 let in_count = graph.fetch_incoming(id)?.len();
208 let degree = out_count + in_count;
209 degrees.insert(id, degree);
210 total_edges += degree;
211 }
212
213 // Total edges m (undirected: each edge counted twice, so m = sum_degrees / 2)
214 let m = total_edges as f64 / 2.0;
215
216 if m == 0.0 {
217 // No edges - each node is its own community
218 let mut communities: Vec<Vec<i64>> = all_ids.iter().map(|&id| vec![id]).collect();
219 communities.sort();
220 return Ok(communities);
221 }
222
223 // Initialize: each node in its own community
224 let mut communities: AHashMap<i64, i64> = all_ids.iter().map(|&id| (id, id)).collect();
225
226 // For deterministic results, process nodes in sorted order
227 let mut node_order: Vec<i64> = all_ids.clone();
228 node_order.sort();
229
230 // Iterative modularity optimization
231 for _iteration in 0..max_iterations {
232 let mut any_moved = false;
233
234 for &node in &node_order {
235 let current_community = *communities.get(&node).unwrap_or(&node);
236 let node_degree = *degrees.get(&node).unwrap_or(&0) as f64;
237
238 // Find neighbor communities
239 let mut community_connections: AHashMap<i64, f64> = AHashMap::new();
240
241 // Count outgoing edges
242 for &neighbor in &graph.fetch_outgoing(node)? {
243 let neighbor_community = *communities.get(&neighbor).unwrap_or(&neighbor);
244 *community_connections
245 .entry(neighbor_community)
246 .or_insert(0.0) += 1.0;
247 }
248
249 // Count incoming edges
250 for &neighbor in &graph.fetch_incoming(node)? {
251 let neighbor_community = *communities.get(&neighbor).unwrap_or(&neighbor);
252 *community_connections
253 .entry(neighbor_community)
254 .or_insert(0.0) += 1.0;
255 }
256
257 // Calculate modularity delta for moving to each neighbor's community
258 let mut best_community = current_community;
259 let mut best_delta = 0.0f64;
260
261 for (&target_community, &edges_to_community) in &community_connections {
262 if target_community == current_community {
263 continue;
264 }
265
266 // Calculate sum of degrees in target community
267 let community_degree: f64 = communities
268 .iter()
269 .filter(|(_, comm)| **comm == target_community)
270 .map(|(&node, _)| *degrees.get(&node).unwrap_or(&0) as f64)
271 .sum();
272
273 // Modularity delta formula:
274 // ΔQ = (edges_in / m) - (edges_total / m)^2
275 // Simplified for single node move:
276 // ΔQ = [(2*edges_to_community - node_degree*community_degree/m) / (2*m)]
277
278 let delta =
279 (2.0 * edges_to_community - node_degree * community_degree / m) / (2.0 * m);
280
281 if delta > best_delta {
282 best_delta = delta;
283 best_community = target_community;
284 }
285 }
286
287 // Move node if it improves modularity
288 if best_community != current_community {
289 communities.insert(node, best_community);
290 any_moved = true;
291 }
292 }
293
294 if !any_moved {
295 break;
296 }
297 }
298
299 // Group nodes by final community
300 let mut communities_map: AHashMap<i64, Vec<i64>> = AHashMap::new();
301 for (node, community) in &communities {
302 communities_map
303 .entry(*community)
304 .or_insert_with(Vec::new)
305 .push(*node);
306 }
307
308 // Convert to sorted vector of communities
309 let mut result: Vec<Vec<i64>> = communities_map.into_values().collect();
310 for community in &mut result {
311 community.sort();
312 }
313 result.sort_by(|a, b| a.first().cmp(&b.first()));
314
315 Ok(result)
316}
317
318/// Louvain method for community detection with progress callback reporting.
319///
320/// This is the progress-reporting variant of [`louvain_communities`]. See that function
321/// for full algorithm documentation.
322///
323/// # Arguments
324/// * `graph` - The graph to analyze
325/// * `max_iterations` - Maximum number of iterations
326/// * `progress` - Callback for progress updates
327///
328/// # Progress Reporting
329/// - Reports progress for each iteration pass: "Louvain pass X"
330/// - Total is None (convergence unknown)
331/// - Calls `on_complete()` when finished or converged
332/// - Calls `on_error()` if an error occurs
333///
334/// # Example
335///
336/// ```rust
337/// use sqlitegraph::{SqliteGraph, algo::louvain_communities_with_progress};
338/// use sqlitegraph::progress::NoProgress;
339///
340/// let graph = SqliteGraph::open_in_memory()?;
341/// // ... add nodes and edges ...
342/// let progress = NoProgress;
343/// let communities = louvain_communities_with_progress(&graph, 10, &progress)?;
344/// ```
345pub fn louvain_communities_with_progress<F>(
346 graph: &SqliteGraph,
347 max_iterations: usize,
348 progress: &F,
349) -> Result<Vec<Vec<i64>>, SqliteGraphError>
350where
351 F: ProgressCallback,
352{
353 let all_ids = graph.all_entity_ids()?;
354
355 if all_ids.is_empty() {
356 progress.on_complete();
357 return Ok(Vec::new());
358 }
359
360 // Calculate total edges (m) and node degrees
361 let mut total_edges = 0usize;
362 let mut degrees: AHashMap<i64, usize> = AHashMap::new();
363
364 for &id in &all_ids {
365 let out_count = graph.fetch_outgoing(id)?.len();
366 let in_count = graph.fetch_incoming(id)?.len();
367 let degree = out_count + in_count;
368 degrees.insert(id, degree);
369 total_edges += degree;
370 }
371
372 // Total edges m (undirected: each edge counted twice, so m = sum_degrees / 2)
373 let m = total_edges as f64 / 2.0;
374
375 if m == 0.0 {
376 progress.on_complete();
377 // No edges - each node is its own community
378 let mut communities: Vec<Vec<i64>> = all_ids.iter().map(|&id| vec![id]).collect();
379 communities.sort();
380 return Ok(communities);
381 }
382
383 // Initialize: each node in its own community
384 let mut communities: AHashMap<i64, i64> = all_ids.iter().map(|&id| (id, id)).collect();
385
386 // For deterministic results, process nodes in sorted order
387 let mut node_order: Vec<i64> = all_ids.clone();
388 node_order.sort();
389
390 // Iterative modularity optimization with progress reporting
391 for iteration in 0..max_iterations {
392 progress.on_progress(
393 iteration + 1,
394 None,
395 &format!("Louvain pass {}", iteration + 1),
396 );
397
398 let mut any_moved = false;
399
400 for &node in &node_order {
401 let current_community = *communities.get(&node).unwrap_or(&node);
402 let node_degree = *degrees.get(&node).unwrap_or(&0) as f64;
403
404 // Find neighbor communities
405 let mut community_connections: AHashMap<i64, f64> = AHashMap::new();
406
407 // Count outgoing edges
408 for &neighbor in &graph.fetch_outgoing(node)? {
409 let neighbor_community = *communities.get(&neighbor).unwrap_or(&neighbor);
410 *community_connections
411 .entry(neighbor_community)
412 .or_insert(0.0) += 1.0;
413 }
414
415 // Count incoming edges
416 for &neighbor in &graph.fetch_incoming(node)? {
417 let neighbor_community = *communities.get(&neighbor).unwrap_or(&neighbor);
418 *community_connections
419 .entry(neighbor_community)
420 .or_insert(0.0) += 1.0;
421 }
422
423 // Calculate modularity delta for moving to each neighbor's community
424 let mut best_community = current_community;
425 let mut best_delta = 0.0f64;
426
427 for (&target_community, &edges_to_community) in &community_connections {
428 if target_community == current_community {
429 continue;
430 }
431
432 // Calculate sum of degrees in target community
433 let community_degree: f64 = communities
434 .iter()
435 .filter(|(_, comm)| **comm == target_community)
436 .map(|(&node, _)| *degrees.get(&node).unwrap_or(&0) as f64)
437 .sum();
438
439 // Modularity delta formula:
440 // ΔQ = (edges_in / m) - (edges_total / m)^2
441 // Simplified for single node move:
442 // ΔQ = [(2*edges_to_community - node_degree*community_degree/m) / (2*m)]
443
444 let delta =
445 (2.0 * edges_to_community - node_degree * community_degree / m) / (2.0 * m);
446
447 if delta > best_delta {
448 best_delta = delta;
449 best_community = target_community;
450 }
451 }
452
453 // Move node if it improves modularity
454 if best_community != current_community {
455 communities.insert(node, best_community);
456 any_moved = true;
457 }
458 }
459
460 if !any_moved {
461 break;
462 }
463 }
464
465 progress.on_complete();
466
467 // Group nodes by final community
468 let mut communities_map: AHashMap<i64, Vec<i64>> = AHashMap::new();
469 for (node, community) in &communities {
470 communities_map
471 .entry(*community)
472 .or_insert_with(Vec::new)
473 .push(*node);
474 }
475
476 // Convert to sorted vector of communities
477 let mut result: Vec<Vec<i64>> = communities_map.into_values().collect();
478 for community in &mut result {
479 community.sort();
480 }
481 result.sort_by(|a, b| a.first().cmp(&b.first()));
482
483 Ok(result)
484}