ruvector-math 2.0.6

Advanced mathematics for next-gen vector search: Optimal Transport, Information Geometry, Product Manifolds
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
//! Spectral Clustering
//!
//! Graph partitioning using spectral methods.
//! Efficient approximation via Chebyshev polynomials.

use super::ScaledLaplacian;

/// Spectral clustering configuration
#[derive(Debug, Clone)]
pub struct ClusteringConfig {
    /// Number of clusters
    pub k: usize,
    /// Number of eigenvectors to use
    pub num_eigenvectors: usize,
    /// Power iteration steps for eigenvector approximation
    pub power_iters: usize,
    /// K-means iterations
    pub kmeans_iters: usize,
    /// Random seed
    pub seed: u64,
}

impl Default for ClusteringConfig {
    fn default() -> Self {
        Self {
            k: 2,
            num_eigenvectors: 10,
            power_iters: 50,
            kmeans_iters: 20,
            seed: 42,
        }
    }
}

/// Spectral clustering result
#[derive(Debug, Clone)]
pub struct ClusteringResult {
    /// Cluster assignment for each vertex
    pub assignments: Vec<usize>,
    /// Eigenvector embedding (n × k)
    pub embedding: Vec<Vec<f64>>,
    /// Number of clusters
    pub k: usize,
}

impl ClusteringResult {
    /// Get vertices in cluster c
    pub fn cluster(&self, c: usize) -> Vec<usize> {
        self.assignments
            .iter()
            .enumerate()
            .filter(|(_, &a)| a == c)
            .map(|(i, _)| i)
            .collect()
    }

    /// Cluster sizes
    pub fn cluster_sizes(&self) -> Vec<usize> {
        let mut sizes = vec![0; self.k];
        for &a in &self.assignments {
            if a < self.k {
                sizes[a] += 1;
            }
        }
        sizes
    }
}

/// Spectral clustering
#[derive(Debug, Clone)]
pub struct SpectralClustering {
    /// Configuration
    config: ClusteringConfig,
}

impl SpectralClustering {
    /// Create with configuration
    pub fn new(config: ClusteringConfig) -> Self {
        Self { config }
    }

    /// Create with just number of clusters
    pub fn with_k(k: usize) -> Self {
        Self::new(ClusteringConfig {
            k,
            num_eigenvectors: k,
            ..Default::default()
        })
    }

    /// Cluster graph using normalized Laplacian eigenvectors
    pub fn cluster(&self, laplacian: &ScaledLaplacian) -> ClusteringResult {
        let n = laplacian.n;
        let k = self.config.k.min(n);
        let num_eig = self.config.num_eigenvectors.min(n);

        // Compute approximate eigenvectors of Laplacian
        // We want the k smallest eigenvalues (smoothest eigenvectors)
        // Use inverse power method on shifted Laplacian
        let embedding = self.compute_embedding(laplacian, num_eig);

        // Run k-means on embedding
        let assignments = self.kmeans(&embedding, k);

        ClusteringResult {
            assignments,
            embedding,
            k,
        }
    }

    /// Cluster using Fiedler vector (k=2)
    pub fn bipartition(&self, laplacian: &ScaledLaplacian) -> ClusteringResult {
        let n = laplacian.n;

        // Compute Fiedler vector (second smallest eigenvector)
        let fiedler = self.compute_fiedler(laplacian);

        // Partition by sign
        let assignments: Vec<usize> = fiedler
            .iter()
            .map(|&v| if v >= 0.0 { 0 } else { 1 })
            .collect();

        ClusteringResult {
            assignments,
            embedding: vec![fiedler],
            k: 2,
        }
    }

    /// Compute spectral embedding (k smallest non-trivial eigenvectors)
    fn compute_embedding(&self, laplacian: &ScaledLaplacian, k: usize) -> Vec<Vec<f64>> {
        let n = laplacian.n;
        if k == 0 || n == 0 {
            return vec![];
        }

        // Initialize random vectors
        let mut vectors: Vec<Vec<f64>> = (0..k)
            .map(|i| {
                (0..n)
                    .map(|j| {
                        let x = ((j * 2654435769 + i * 1103515245 + self.config.seed as usize)
                            as f64
                            / 4294967296.0)
                            * 2.0
                            - 1.0;
                        x
                    })
                    .collect()
            })
            .collect();

        // Power iteration to find smallest eigenvectors
        // We use (I - L_scaled) which has largest eigenvalue where L_scaled has smallest
        for _ in 0..self.config.power_iters {
            for i in 0..k {
                // Apply (I - L_scaled) = (2I - L)/λ_max approximately
                // Simpler: just use deflated power iteration on L for smallest
                let mut y = vec![0.0; n];
                let lx = laplacian.apply(&vectors[i]);

                // We want small eigenvalues, so use (λ_max*I - L)
                let shift = 2.0; // Approximate max eigenvalue of scaled Laplacian
                for j in 0..n {
                    y[j] = shift * vectors[i][j] - lx[j];
                }

                // Orthogonalize against previous vectors and constant vector
                // First, remove constant component (eigenvalue 0)
                let mean: f64 = y.iter().sum::<f64>() / n as f64;
                for j in 0..n {
                    y[j] -= mean;
                }

                // Then orthogonalize against previous eigenvectors
                for prev in 0..i {
                    let dot: f64 = y.iter().zip(vectors[prev].iter()).map(|(a, b)| a * b).sum();
                    for j in 0..n {
                        y[j] -= dot * vectors[prev][j];
                    }
                }

                // Normalize
                let norm: f64 = y.iter().map(|x| x * x).sum::<f64>().sqrt();
                if norm > 1e-15 {
                    for j in 0..n {
                        y[j] /= norm;
                    }
                }

                vectors[i] = y;
            }
        }

        vectors
    }

    /// Compute Fiedler vector (second smallest eigenvector)
    fn compute_fiedler(&self, laplacian: &ScaledLaplacian) -> Vec<f64> {
        let embedding = self.compute_embedding(laplacian, 1);
        if embedding.is_empty() {
            return vec![0.0; laplacian.n];
        }
        embedding[0].clone()
    }

    /// K-means clustering on embedding
    fn kmeans(&self, embedding: &[Vec<f64>], k: usize) -> Vec<usize> {
        if embedding.is_empty() {
            return vec![];
        }

        let n = embedding[0].len();
        let dim = embedding.len();

        if n == 0 || k == 0 {
            return vec![];
        }

        // Initialize centroids (k-means++ style)
        let mut centroids: Vec<Vec<f64>> = Vec::with_capacity(k);

        // First centroid: random point
        let first = (self.config.seed as usize) % n;
        centroids.push((0..dim).map(|d| embedding[d][first]).collect());

        // Remaining centroids: proportional to squared distance
        for _ in 1..k {
            let mut distances: Vec<f64> = (0..n)
                .map(|i| {
                    centroids
                        .iter()
                        .map(|c| {
                            (0..dim)
                                .map(|d| (embedding[d][i] - c[d]).powi(2))
                                .sum::<f64>()
                        })
                        .fold(f64::INFINITY, f64::min)
                })
                .collect();

            let total: f64 = distances.iter().sum();
            if total > 0.0 {
                let threshold = (self.config.seed as f64 / 4294967296.0) * total;
                let mut cumsum = 0.0;
                let mut chosen = 0;
                for (i, &d) in distances.iter().enumerate() {
                    cumsum += d;
                    if cumsum >= threshold {
                        chosen = i;
                        break;
                    }
                }
                centroids.push((0..dim).map(|d| embedding[d][chosen]).collect());
            } else {
                // Degenerate case
                centroids.push(vec![0.0; dim]);
            }
        }

        // K-means iterations
        let mut assignments = vec![0; n];

        for _ in 0..self.config.kmeans_iters {
            // Assign points to nearest centroid
            for i in 0..n {
                let mut best_cluster = 0;
                let mut best_dist = f64::INFINITY;

                for (c, centroid) in centroids.iter().enumerate() {
                    let dist: f64 = (0..dim)
                        .map(|d| (embedding[d][i] - centroid[d]).powi(2))
                        .sum();

                    if dist < best_dist {
                        best_dist = dist;
                        best_cluster = c;
                    }
                }

                assignments[i] = best_cluster;
            }

            // Update centroids
            let mut counts = vec![0usize; k];
            for centroid in centroids.iter_mut() {
                for v in centroid.iter_mut() {
                    *v = 0.0;
                }
            }

            for (i, &c) in assignments.iter().enumerate() {
                counts[c] += 1;
                for d in 0..dim {
                    centroids[c][d] += embedding[d][i];
                }
            }

            for (c, centroid) in centroids.iter_mut().enumerate() {
                if counts[c] > 0 {
                    for v in centroid.iter_mut() {
                        *v /= counts[c] as f64;
                    }
                }
            }
        }

        assignments
    }

    /// Compute normalized cut value for a bipartition
    pub fn normalized_cut(&self, laplacian: &ScaledLaplacian, partition: &[bool]) -> f64 {
        let n = laplacian.n;
        if n == 0 {
            return 0.0;
        }

        // Compute cut and volumes
        let mut cut = 0.0;
        let mut vol_a = 0.0;
        let mut vol_b = 0.0;

        // For each entry in Laplacian
        for &(i, j, v) in &laplacian.entries {
            if i < n && j < n && i != j {
                // This is an edge (negative Laplacian entry)
                let w = -v; // Edge weight
                if w > 0.0 && partition[i] != partition[j] {
                    cut += w;
                }
            }
            if i == j && i < n {
                // Diagonal = degree
                if partition[i] {
                    vol_a += v;
                } else {
                    vol_b += v;
                }
            }
        }

        // NCut = cut/vol(A) + cut/vol(B)
        let ncut = if vol_a > 0.0 { cut / vol_a } else { 0.0 }
            + if vol_b > 0.0 { cut / vol_b } else { 0.0 };

        ncut
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn two_cliques_graph() -> ScaledLaplacian {
        // Two cliques of size 3 connected by one edge
        let edges = vec![
            // Clique 1
            (0, 1, 1.0),
            (0, 2, 1.0),
            (1, 2, 1.0),
            // Clique 2
            (3, 4, 1.0),
            (3, 5, 1.0),
            (4, 5, 1.0),
            // Bridge
            (2, 3, 0.1),
        ];
        ScaledLaplacian::from_sparse_adjacency(&edges, 6)
    }

    #[test]
    fn test_spectral_clustering() {
        let laplacian = two_cliques_graph();
        let clustering = SpectralClustering::with_k(2);

        let result = clustering.cluster(&laplacian);

        assert_eq!(result.assignments.len(), 6);
        assert_eq!(result.k, 2);

        // Should roughly separate the two cliques
        let sizes = result.cluster_sizes();
        assert_eq!(sizes.iter().sum::<usize>(), 6);
    }

    #[test]
    fn test_bipartition() {
        let laplacian = two_cliques_graph();
        let clustering = SpectralClustering::with_k(2);

        let result = clustering.bipartition(&laplacian);

        assert_eq!(result.assignments.len(), 6);
        assert_eq!(result.k, 2);
    }

    #[test]
    fn test_cluster_extraction() {
        let laplacian = two_cliques_graph();
        let clustering = SpectralClustering::with_k(2);
        let result = clustering.cluster(&laplacian);

        let c0 = result.cluster(0);
        let c1 = result.cluster(1);

        // All vertices assigned
        assert_eq!(c0.len() + c1.len(), 6);
    }

    #[test]
    fn test_normalized_cut() {
        let laplacian = two_cliques_graph();
        let clustering = SpectralClustering::with_k(2);

        // Good partition: separate cliques
        let good_partition = vec![true, true, true, false, false, false];
        let good_ncut = clustering.normalized_cut(&laplacian, &good_partition);

        // Bad partition: mix cliques
        let bad_partition = vec![true, false, true, false, true, false];
        let bad_ncut = clustering.normalized_cut(&laplacian, &bad_partition);

        // Good partition should have lower normalized cut
        // (This is a heuristic test, actual values depend on graph structure)
        assert!(good_ncut >= 0.0);
        assert!(bad_ncut >= 0.0);
    }

    #[test]
    fn test_single_node() {
        let laplacian = ScaledLaplacian::from_sparse_adjacency(&[], 1);
        let clustering = SpectralClustering::with_k(1);

        let result = clustering.cluster(&laplacian);

        assert_eq!(result.assignments.len(), 1);
        assert_eq!(result.assignments[0], 0);
    }
}