ruvector_math/spectral/
clustering.rs

1//! Spectral Clustering
2//!
3//! Graph partitioning using spectral methods.
4//! Efficient approximation via Chebyshev polynomials.
5
6use super::ScaledLaplacian;
7
8/// Spectral clustering configuration
9#[derive(Debug, Clone)]
10pub struct ClusteringConfig {
11    /// Number of clusters
12    pub k: usize,
13    /// Number of eigenvectors to use
14    pub num_eigenvectors: usize,
15    /// Power iteration steps for eigenvector approximation
16    pub power_iters: usize,
17    /// K-means iterations
18    pub kmeans_iters: usize,
19    /// Random seed
20    pub seed: u64,
21}
22
23impl Default for ClusteringConfig {
24    fn default() -> Self {
25        Self {
26            k: 2,
27            num_eigenvectors: 10,
28            power_iters: 50,
29            kmeans_iters: 20,
30            seed: 42,
31        }
32    }
33}
34
35/// Spectral clustering result
36#[derive(Debug, Clone)]
37pub struct ClusteringResult {
38    /// Cluster assignment for each vertex
39    pub assignments: Vec<usize>,
40    /// Eigenvector embedding (n × k)
41    pub embedding: Vec<Vec<f64>>,
42    /// Number of clusters
43    pub k: usize,
44}
45
46impl ClusteringResult {
47    /// Get vertices in cluster c
48    pub fn cluster(&self, c: usize) -> Vec<usize> {
49        self.assignments
50            .iter()
51            .enumerate()
52            .filter(|(_, &a)| a == c)
53            .map(|(i, _)| i)
54            .collect()
55    }
56
57    /// Cluster sizes
58    pub fn cluster_sizes(&self) -> Vec<usize> {
59        let mut sizes = vec![0; self.k];
60        for &a in &self.assignments {
61            if a < self.k {
62                sizes[a] += 1;
63            }
64        }
65        sizes
66    }
67}
68
69/// Spectral clustering
70#[derive(Debug, Clone)]
71pub struct SpectralClustering {
72    /// Configuration
73    config: ClusteringConfig,
74}
75
76impl SpectralClustering {
77    /// Create with configuration
78    pub fn new(config: ClusteringConfig) -> Self {
79        Self { config }
80    }
81
82    /// Create with just number of clusters
83    pub fn with_k(k: usize) -> Self {
84        Self::new(ClusteringConfig {
85            k,
86            num_eigenvectors: k,
87            ..Default::default()
88        })
89    }
90
91    /// Cluster graph using normalized Laplacian eigenvectors
92    pub fn cluster(&self, laplacian: &ScaledLaplacian) -> ClusteringResult {
93        let n = laplacian.n;
94        let k = self.config.k.min(n);
95        let num_eig = self.config.num_eigenvectors.min(n);
96
97        // Compute approximate eigenvectors of Laplacian
98        // We want the k smallest eigenvalues (smoothest eigenvectors)
99        // Use inverse power method on shifted Laplacian
100        let embedding = self.compute_embedding(laplacian, num_eig);
101
102        // Run k-means on embedding
103        let assignments = self.kmeans(&embedding, k);
104
105        ClusteringResult {
106            assignments,
107            embedding,
108            k,
109        }
110    }
111
112    /// Cluster using Fiedler vector (k=2)
113    pub fn bipartition(&self, laplacian: &ScaledLaplacian) -> ClusteringResult {
114        let n = laplacian.n;
115
116        // Compute Fiedler vector (second smallest eigenvector)
117        let fiedler = self.compute_fiedler(laplacian);
118
119        // Partition by sign
120        let assignments: Vec<usize> = fiedler.iter().map(|&v| if v >= 0.0 { 0 } else { 1 }).collect();
121
122        ClusteringResult {
123            assignments,
124            embedding: vec![fiedler],
125            k: 2,
126        }
127    }
128
129    /// Compute spectral embedding (k smallest non-trivial eigenvectors)
130    fn compute_embedding(&self, laplacian: &ScaledLaplacian, k: usize) -> Vec<Vec<f64>> {
131        let n = laplacian.n;
132        if k == 0 || n == 0 {
133            return vec![];
134        }
135
136        // Initialize random vectors
137        let mut vectors: Vec<Vec<f64>> = (0..k)
138            .map(|i| {
139                (0..n)
140                    .map(|j| {
141                        let x = ((j * 2654435769 + i * 1103515245 + self.config.seed as usize) as f64
142                            / 4294967296.0)
143                            * 2.0
144                            - 1.0;
145                        x
146                    })
147                    .collect()
148            })
149            .collect();
150
151        // Power iteration to find smallest eigenvectors
152        // We use (I - L_scaled) which has largest eigenvalue where L_scaled has smallest
153        for _ in 0..self.config.power_iters {
154            for i in 0..k {
155                // Apply (I - L_scaled) = (2I - L)/λ_max approximately
156                // Simpler: just use deflated power iteration on L for smallest
157                let mut y = vec![0.0; n];
158                let lx = laplacian.apply(&vectors[i]);
159
160                // We want small eigenvalues, so use (λ_max*I - L)
161                let shift = 2.0; // Approximate max eigenvalue of scaled Laplacian
162                for j in 0..n {
163                    y[j] = shift * vectors[i][j] - lx[j];
164                }
165
166                // Orthogonalize against previous vectors and constant vector
167                // First, remove constant component (eigenvalue 0)
168                let mean: f64 = y.iter().sum::<f64>() / n as f64;
169                for j in 0..n {
170                    y[j] -= mean;
171                }
172
173                // Then orthogonalize against previous eigenvectors
174                for prev in 0..i {
175                    let dot: f64 = y.iter().zip(vectors[prev].iter()).map(|(a, b)| a * b).sum();
176                    for j in 0..n {
177                        y[j] -= dot * vectors[prev][j];
178                    }
179                }
180
181                // Normalize
182                let norm: f64 = y.iter().map(|x| x * x).sum::<f64>().sqrt();
183                if norm > 1e-15 {
184                    for j in 0..n {
185                        y[j] /= norm;
186                    }
187                }
188
189                vectors[i] = y;
190            }
191        }
192
193        vectors
194    }
195
196    /// Compute Fiedler vector (second smallest eigenvector)
197    fn compute_fiedler(&self, laplacian: &ScaledLaplacian) -> Vec<f64> {
198        let embedding = self.compute_embedding(laplacian, 1);
199        if embedding.is_empty() {
200            return vec![0.0; laplacian.n];
201        }
202        embedding[0].clone()
203    }
204
205    /// K-means clustering on embedding
206    fn kmeans(&self, embedding: &[Vec<f64>], k: usize) -> Vec<usize> {
207        if embedding.is_empty() {
208            return vec![];
209        }
210
211        let n = embedding[0].len();
212        let dim = embedding.len();
213
214        if n == 0 || k == 0 {
215            return vec![];
216        }
217
218        // Initialize centroids (k-means++ style)
219        let mut centroids: Vec<Vec<f64>> = Vec::with_capacity(k);
220
221        // First centroid: random point
222        let first = (self.config.seed as usize) % n;
223        centroids.push((0..dim).map(|d| embedding[d][first]).collect());
224
225        // Remaining centroids: proportional to squared distance
226        for _ in 1..k {
227            let mut distances: Vec<f64> = (0..n)
228                .map(|i| {
229                    centroids
230                        .iter()
231                        .map(|c| {
232                            (0..dim)
233                                .map(|d| (embedding[d][i] - c[d]).powi(2))
234                                .sum::<f64>()
235                        })
236                        .fold(f64::INFINITY, f64::min)
237                })
238                .collect();
239
240            let total: f64 = distances.iter().sum();
241            if total > 0.0 {
242                let threshold = (self.config.seed as f64 / 4294967296.0) * total;
243                let mut cumsum = 0.0;
244                let mut chosen = 0;
245                for (i, &d) in distances.iter().enumerate() {
246                    cumsum += d;
247                    if cumsum >= threshold {
248                        chosen = i;
249                        break;
250                    }
251                }
252                centroids.push((0..dim).map(|d| embedding[d][chosen]).collect());
253            } else {
254                // Degenerate case
255                centroids.push(vec![0.0; dim]);
256            }
257        }
258
259        // K-means iterations
260        let mut assignments = vec![0; n];
261
262        for _ in 0..self.config.kmeans_iters {
263            // Assign points to nearest centroid
264            for i in 0..n {
265                let mut best_cluster = 0;
266                let mut best_dist = f64::INFINITY;
267
268                for (c, centroid) in centroids.iter().enumerate() {
269                    let dist: f64 = (0..dim)
270                        .map(|d| (embedding[d][i] - centroid[d]).powi(2))
271                        .sum();
272
273                    if dist < best_dist {
274                        best_dist = dist;
275                        best_cluster = c;
276                    }
277                }
278
279                assignments[i] = best_cluster;
280            }
281
282            // Update centroids
283            let mut counts = vec![0usize; k];
284            for centroid in centroids.iter_mut() {
285                for v in centroid.iter_mut() {
286                    *v = 0.0;
287                }
288            }
289
290            for (i, &c) in assignments.iter().enumerate() {
291                counts[c] += 1;
292                for d in 0..dim {
293                    centroids[c][d] += embedding[d][i];
294                }
295            }
296
297            for (c, centroid) in centroids.iter_mut().enumerate() {
298                if counts[c] > 0 {
299                    for v in centroid.iter_mut() {
300                        *v /= counts[c] as f64;
301                    }
302                }
303            }
304        }
305
306        assignments
307    }
308
309    /// Compute normalized cut value for a bipartition
310    pub fn normalized_cut(&self, laplacian: &ScaledLaplacian, partition: &[bool]) -> f64 {
311        let n = laplacian.n;
312        if n == 0 {
313            return 0.0;
314        }
315
316        // Compute cut and volumes
317        let mut cut = 0.0;
318        let mut vol_a = 0.0;
319        let mut vol_b = 0.0;
320
321        // For each entry in Laplacian
322        for &(i, j, v) in &laplacian.entries {
323            if i < n && j < n && i != j {
324                // This is an edge (negative Laplacian entry)
325                let w = -v; // Edge weight
326                if w > 0.0 && partition[i] != partition[j] {
327                    cut += w;
328                }
329            }
330            if i == j && i < n {
331                // Diagonal = degree
332                if partition[i] {
333                    vol_a += v;
334                } else {
335                    vol_b += v;
336                }
337            }
338        }
339
340        // NCut = cut/vol(A) + cut/vol(B)
341        let ncut = if vol_a > 0.0 { cut / vol_a } else { 0.0 }
342            + if vol_b > 0.0 { cut / vol_b } else { 0.0 };
343
344        ncut
345    }
346}
347
348#[cfg(test)]
349mod tests {
350    use super::*;
351
352    fn two_cliques_graph() -> ScaledLaplacian {
353        // Two cliques of size 3 connected by one edge
354        let edges = vec![
355            // Clique 1
356            (0, 1, 1.0),
357            (0, 2, 1.0),
358            (1, 2, 1.0),
359            // Clique 2
360            (3, 4, 1.0),
361            (3, 5, 1.0),
362            (4, 5, 1.0),
363            // Bridge
364            (2, 3, 0.1),
365        ];
366        ScaledLaplacian::from_sparse_adjacency(&edges, 6)
367    }
368
369    #[test]
370    fn test_spectral_clustering() {
371        let laplacian = two_cliques_graph();
372        let clustering = SpectralClustering::with_k(2);
373
374        let result = clustering.cluster(&laplacian);
375
376        assert_eq!(result.assignments.len(), 6);
377        assert_eq!(result.k, 2);
378
379        // Should roughly separate the two cliques
380        let sizes = result.cluster_sizes();
381        assert_eq!(sizes.iter().sum::<usize>(), 6);
382    }
383
384    #[test]
385    fn test_bipartition() {
386        let laplacian = two_cliques_graph();
387        let clustering = SpectralClustering::with_k(2);
388
389        let result = clustering.bipartition(&laplacian);
390
391        assert_eq!(result.assignments.len(), 6);
392        assert_eq!(result.k, 2);
393    }
394
395    #[test]
396    fn test_cluster_extraction() {
397        let laplacian = two_cliques_graph();
398        let clustering = SpectralClustering::with_k(2);
399        let result = clustering.cluster(&laplacian);
400
401        let c0 = result.cluster(0);
402        let c1 = result.cluster(1);
403
404        // All vertices assigned
405        assert_eq!(c0.len() + c1.len(), 6);
406    }
407
408    #[test]
409    fn test_normalized_cut() {
410        let laplacian = two_cliques_graph();
411        let clustering = SpectralClustering::with_k(2);
412
413        // Good partition: separate cliques
414        let good_partition = vec![true, true, true, false, false, false];
415        let good_ncut = clustering.normalized_cut(&laplacian, &good_partition);
416
417        // Bad partition: mix cliques
418        let bad_partition = vec![true, false, true, false, true, false];
419        let bad_ncut = clustering.normalized_cut(&laplacian, &bad_partition);
420
421        // Good partition should have lower normalized cut
422        // (This is a heuristic test, actual values depend on graph structure)
423        assert!(good_ncut >= 0.0);
424        assert!(bad_ncut >= 0.0);
425    }
426
427    #[test]
428    fn test_single_node() {
429        let laplacian = ScaledLaplacian::from_sparse_adjacency(&[], 1);
430        let clustering = SpectralClustering::with_k(1);
431
432        let result = clustering.cluster(&laplacian);
433
434        assert_eq!(result.assignments.len(), 1);
435        assert_eq!(result.assignments[0], 0);
436    }
437}