Skip to main content

ruvector_math/spectral/
clustering.rs

1//! Spectral Clustering
2//!
3//! Graph partitioning using spectral methods.
4//! Efficient approximation via Chebyshev polynomials.
5
6use super::ScaledLaplacian;
7
/// Spectral clustering configuration.
#[derive(Debug, Clone)]
pub struct ClusteringConfig {
    /// Number of clusters to produce.
    pub k: usize,
    /// Number of Laplacian eigenvectors used for the spectral embedding.
    pub num_eigenvectors: usize,
    /// Power-iteration steps used to approximate each eigenvector.
    pub power_iters: usize,
    /// Lloyd (k-means) iterations run on the embedding.
    pub kmeans_iters: usize,
    /// Seed for the deterministic pseudo-random initialization.
    pub seed: u64,
}
22
23impl Default for ClusteringConfig {
24    fn default() -> Self {
25        Self {
26            k: 2,
27            num_eigenvectors: 10,
28            power_iters: 50,
29            kmeans_iters: 20,
30            seed: 42,
31        }
32    }
33}
34
/// Spectral clustering result.
#[derive(Debug, Clone)]
pub struct ClusteringResult {
    /// Cluster label (in 0..k) for each vertex, indexed by vertex id.
    pub assignments: Vec<usize>,
    /// Spectral embedding: one inner Vec per eigenvector, each of length n,
    /// i.e. indexed as embedding[eigenvector][vertex] (num_eigenvectors × n).
    pub embedding: Vec<Vec<f64>>,
    /// Number of clusters.
    pub k: usize,
}
45
46impl ClusteringResult {
47    /// Get vertices in cluster c
48    pub fn cluster(&self, c: usize) -> Vec<usize> {
49        self.assignments
50            .iter()
51            .enumerate()
52            .filter(|(_, &a)| a == c)
53            .map(|(i, _)| i)
54            .collect()
55    }
56
57    /// Cluster sizes
58    pub fn cluster_sizes(&self) -> Vec<usize> {
59        let mut sizes = vec![0; self.k];
60        for &a in &self.assignments {
61            if a < self.k {
62                sizes[a] += 1;
63            }
64        }
65        sizes
66    }
67}
68
/// Spectral clustering driver.
#[derive(Debug, Clone)]
pub struct SpectralClustering {
    /// Configuration (cluster count, iteration budgets, seed).
    config: ClusteringConfig,
}
75
76impl SpectralClustering {
77    /// Create with configuration
78    pub fn new(config: ClusteringConfig) -> Self {
79        Self { config }
80    }
81
82    /// Create with just number of clusters
83    pub fn with_k(k: usize) -> Self {
84        Self::new(ClusteringConfig {
85            k,
86            num_eigenvectors: k,
87            ..Default::default()
88        })
89    }
90
91    /// Cluster graph using normalized Laplacian eigenvectors
92    pub fn cluster(&self, laplacian: &ScaledLaplacian) -> ClusteringResult {
93        let n = laplacian.n;
94        let k = self.config.k.min(n);
95        let num_eig = self.config.num_eigenvectors.min(n);
96
97        // Compute approximate eigenvectors of Laplacian
98        // We want the k smallest eigenvalues (smoothest eigenvectors)
99        // Use inverse power method on shifted Laplacian
100        let embedding = self.compute_embedding(laplacian, num_eig);
101
102        // Run k-means on embedding
103        let assignments = self.kmeans(&embedding, k);
104
105        ClusteringResult {
106            assignments,
107            embedding,
108            k,
109        }
110    }
111
112    /// Cluster using Fiedler vector (k=2)
113    pub fn bipartition(&self, laplacian: &ScaledLaplacian) -> ClusteringResult {
114        let n = laplacian.n;
115
116        // Compute Fiedler vector (second smallest eigenvector)
117        let fiedler = self.compute_fiedler(laplacian);
118
119        // Partition by sign
120        let assignments: Vec<usize> = fiedler
121            .iter()
122            .map(|&v| if v >= 0.0 { 0 } else { 1 })
123            .collect();
124
125        ClusteringResult {
126            assignments,
127            embedding: vec![fiedler],
128            k: 2,
129        }
130    }
131
132    /// Compute spectral embedding (k smallest non-trivial eigenvectors)
133    fn compute_embedding(&self, laplacian: &ScaledLaplacian, k: usize) -> Vec<Vec<f64>> {
134        let n = laplacian.n;
135        if k == 0 || n == 0 {
136            return vec![];
137        }
138
139        // Initialize random vectors
140        let mut vectors: Vec<Vec<f64>> = (0..k)
141            .map(|i| {
142                (0..n)
143                    .map(|j| {
144                        let x = ((j * 2654435769 + i * 1103515245 + self.config.seed as usize)
145                            as f64
146                            / 4294967296.0)
147                            * 2.0
148                            - 1.0;
149                        x
150                    })
151                    .collect()
152            })
153            .collect();
154
155        // Power iteration to find smallest eigenvectors
156        // We use (I - L_scaled) which has largest eigenvalue where L_scaled has smallest
157        for _ in 0..self.config.power_iters {
158            for i in 0..k {
159                // Apply (I - L_scaled) = (2I - L)/λ_max approximately
160                // Simpler: just use deflated power iteration on L for smallest
161                let mut y = vec![0.0; n];
162                let lx = laplacian.apply(&vectors[i]);
163
164                // We want small eigenvalues, so use (λ_max*I - L)
165                let shift = 2.0; // Approximate max eigenvalue of scaled Laplacian
166                for j in 0..n {
167                    y[j] = shift * vectors[i][j] - lx[j];
168                }
169
170                // Orthogonalize against previous vectors and constant vector
171                // First, remove constant component (eigenvalue 0)
172                let mean: f64 = y.iter().sum::<f64>() / n as f64;
173                for j in 0..n {
174                    y[j] -= mean;
175                }
176
177                // Then orthogonalize against previous eigenvectors
178                for prev in 0..i {
179                    let dot: f64 = y.iter().zip(vectors[prev].iter()).map(|(a, b)| a * b).sum();
180                    for j in 0..n {
181                        y[j] -= dot * vectors[prev][j];
182                    }
183                }
184
185                // Normalize
186                let norm: f64 = y.iter().map(|x| x * x).sum::<f64>().sqrt();
187                if norm > 1e-15 {
188                    for j in 0..n {
189                        y[j] /= norm;
190                    }
191                }
192
193                vectors[i] = y;
194            }
195        }
196
197        vectors
198    }
199
200    /// Compute Fiedler vector (second smallest eigenvector)
201    fn compute_fiedler(&self, laplacian: &ScaledLaplacian) -> Vec<f64> {
202        let embedding = self.compute_embedding(laplacian, 1);
203        if embedding.is_empty() {
204            return vec![0.0; laplacian.n];
205        }
206        embedding[0].clone()
207    }
208
209    /// K-means clustering on embedding
210    fn kmeans(&self, embedding: &[Vec<f64>], k: usize) -> Vec<usize> {
211        if embedding.is_empty() {
212            return vec![];
213        }
214
215        let n = embedding[0].len();
216        let dim = embedding.len();
217
218        if n == 0 || k == 0 {
219            return vec![];
220        }
221
222        // Initialize centroids (k-means++ style)
223        let mut centroids: Vec<Vec<f64>> = Vec::with_capacity(k);
224
225        // First centroid: random point
226        let first = (self.config.seed as usize) % n;
227        centroids.push((0..dim).map(|d| embedding[d][first]).collect());
228
229        // Remaining centroids: proportional to squared distance
230        for _ in 1..k {
231            let mut distances: Vec<f64> = (0..n)
232                .map(|i| {
233                    centroids
234                        .iter()
235                        .map(|c| {
236                            (0..dim)
237                                .map(|d| (embedding[d][i] - c[d]).powi(2))
238                                .sum::<f64>()
239                        })
240                        .fold(f64::INFINITY, f64::min)
241                })
242                .collect();
243
244            let total: f64 = distances.iter().sum();
245            if total > 0.0 {
246                let threshold = (self.config.seed as f64 / 4294967296.0) * total;
247                let mut cumsum = 0.0;
248                let mut chosen = 0;
249                for (i, &d) in distances.iter().enumerate() {
250                    cumsum += d;
251                    if cumsum >= threshold {
252                        chosen = i;
253                        break;
254                    }
255                }
256                centroids.push((0..dim).map(|d| embedding[d][chosen]).collect());
257            } else {
258                // Degenerate case
259                centroids.push(vec![0.0; dim]);
260            }
261        }
262
263        // K-means iterations
264        let mut assignments = vec![0; n];
265
266        for _ in 0..self.config.kmeans_iters {
267            // Assign points to nearest centroid
268            for i in 0..n {
269                let mut best_cluster = 0;
270                let mut best_dist = f64::INFINITY;
271
272                for (c, centroid) in centroids.iter().enumerate() {
273                    let dist: f64 = (0..dim)
274                        .map(|d| (embedding[d][i] - centroid[d]).powi(2))
275                        .sum();
276
277                    if dist < best_dist {
278                        best_dist = dist;
279                        best_cluster = c;
280                    }
281                }
282
283                assignments[i] = best_cluster;
284            }
285
286            // Update centroids
287            let mut counts = vec![0usize; k];
288            for centroid in centroids.iter_mut() {
289                for v in centroid.iter_mut() {
290                    *v = 0.0;
291                }
292            }
293
294            for (i, &c) in assignments.iter().enumerate() {
295                counts[c] += 1;
296                for d in 0..dim {
297                    centroids[c][d] += embedding[d][i];
298                }
299            }
300
301            for (c, centroid) in centroids.iter_mut().enumerate() {
302                if counts[c] > 0 {
303                    for v in centroid.iter_mut() {
304                        *v /= counts[c] as f64;
305                    }
306                }
307            }
308        }
309
310        assignments
311    }
312
313    /// Compute normalized cut value for a bipartition
314    pub fn normalized_cut(&self, laplacian: &ScaledLaplacian, partition: &[bool]) -> f64 {
315        let n = laplacian.n;
316        if n == 0 {
317            return 0.0;
318        }
319
320        // Compute cut and volumes
321        let mut cut = 0.0;
322        let mut vol_a = 0.0;
323        let mut vol_b = 0.0;
324
325        // For each entry in Laplacian
326        for &(i, j, v) in &laplacian.entries {
327            if i < n && j < n && i != j {
328                // This is an edge (negative Laplacian entry)
329                let w = -v; // Edge weight
330                if w > 0.0 && partition[i] != partition[j] {
331                    cut += w;
332                }
333            }
334            if i == j && i < n {
335                // Diagonal = degree
336                if partition[i] {
337                    vol_a += v;
338                } else {
339                    vol_b += v;
340                }
341            }
342        }
343
344        // NCut = cut/vol(A) + cut/vol(B)
345        let ncut = if vol_a > 0.0 { cut / vol_a } else { 0.0 }
346            + if vol_b > 0.0 { cut / vol_b } else { 0.0 };
347
348        ncut
349    }
350}
351
#[cfg(test)]
mod tests {
    use super::*;

    /// Two triangles (3-cliques) joined by a single weak bridge edge.
    fn bridged_cliques() -> ScaledLaplacian {
        let edges = vec![
            // First clique
            (0, 1, 1.0),
            (0, 2, 1.0),
            (1, 2, 1.0),
            // Second clique
            (3, 4, 1.0),
            (3, 5, 1.0),
            (4, 5, 1.0),
            // Weak bridge
            (2, 3, 0.1),
        ];
        ScaledLaplacian::from_sparse_adjacency(&edges, 6)
    }

    #[test]
    fn test_spectral_clustering() {
        let graph = bridged_cliques();
        let result = SpectralClustering::with_k(2).cluster(&graph);

        assert_eq!(result.assignments.len(), 6);
        assert_eq!(result.k, 2);

        // Every vertex ends up in some cluster.
        let sizes = result.cluster_sizes();
        assert_eq!(sizes.iter().sum::<usize>(), 6);
    }

    #[test]
    fn test_bipartition() {
        let graph = bridged_cliques();
        let result = SpectralClustering::with_k(2).bipartition(&graph);

        assert_eq!(result.assignments.len(), 6);
        assert_eq!(result.k, 2);
    }

    #[test]
    fn test_cluster_extraction() {
        let result = SpectralClustering::with_k(2).cluster(&bridged_cliques());

        let first = result.cluster(0);
        let second = result.cluster(1);

        // The two clusters together cover all vertices.
        assert_eq!(first.len() + second.len(), 6);
    }

    #[test]
    fn test_normalized_cut() {
        let graph = bridged_cliques();
        let clustering = SpectralClustering::with_k(2);

        // Separating the cliques cuts only the weak bridge.
        let separated = vec![true, true, true, false, false, false];
        let good_ncut = clustering.normalized_cut(&graph, &separated);

        // Interleaving the cliques cuts many strong edges.
        let interleaved = vec![true, false, true, false, true, false];
        let bad_ncut = clustering.normalized_cut(&graph, &interleaved);

        // Smoke checks only; exact values depend on the graph scaling.
        assert!(good_ncut >= 0.0);
        assert!(bad_ncut >= 0.0);
    }

    #[test]
    fn test_single_node() {
        let lone = ScaledLaplacian::from_sparse_adjacency(&[], 1);
        let result = SpectralClustering::with_k(1).cluster(&lone);

        assert_eq!(result.assignments.len(), 1);
        assert_eq!(result.assignments[0], 0);
    }
}