use ndarray::{Array1, Array2};
use rand::rngs::StdRng;
use rand::SeedableRng;
use rand_distr::{Distribution, Normal};
/// Draw `n` 2-D points from an axis-aligned Gaussian centred at (`cx`, `cy`)
/// with per-axis standard deviations `std_x` / `std_y`.
///
/// Sampling is done in f64 (what `rand_distr::Normal` is parameterised with
/// here) and narrowed back to f32 for storage.
fn sample_cluster(
    rng: &mut StdRng,
    cx: f32,
    cy: f32,
    std_x: f32,
    std_y: f32,
    n: usize,
) -> Vec<Vec<f32>> {
    // Independent 1-D normals per axis; `expect` is safe for finite mean/std.
    let x_dist = Normal::new(f64::from(cx), f64::from(std_x)).expect("valid normal");
    let y_dist = Normal::new(f64::from(cy), f64::from(std_y)).expect("valid normal");
    let mut points = Vec::with_capacity(n);
    for _ in 0..n {
        // Same draw order as before: x first, then y, for each point.
        points.push(vec![x_dist.sample(rng) as f32, y_dist.sample(rng) as f32]);
    }
    points
}
/// Build the pairwise cost matrix `C[i][j] = ||xs[i] - ys[j]||²`
/// (squared Euclidean distance) between two sets of 2-D points.
fn cost_matrix(xs: &[Vec<f32>], ys: &[Vec<f32>]) -> Array2<f32> {
    // `from_shape_fn` fills the (m, n) matrix element by element in
    // row-major order, which matches the original nested-loop fill.
    Array2::from_shape_fn((xs.len(), ys.len()), |(i, j)| {
        let dx = xs[i][0] - ys[j][0];
        let dy = xs[i][1] - ys[j][1];
        dx * dx + dy * dy
    })
}
/// Arithmetic mean of a set of 2-D points, returned as `[mean_x, mean_y]`.
///
/// Note: an empty slice yields `[NaN, NaN]` (0.0 / 0.0), same as before.
fn centroid(pts: &[Vec<f32>]) -> [f32; 2] {
    // Single pass accumulating both coordinates; addition order per axis
    // matches the original left-to-right `sum()`, so results are identical.
    let (sum_x, sum_y) = pts
        .iter()
        .fold((0.0f32, 0.0f32), |(ax, ay), p| (ax + p[0], ay + p[1]));
    let count = pts.len() as f32;
    [sum_x / count, sum_y / count]
}
/// Euclidean (L2) distance between two 2-D points.
fn euclidean(a: &[f32; 2], b: &[f32; 2]) -> f32 {
    let dx = a[0] - b[0];
    let dy = a[1] - b[1];
    // dx*dx is bit-identical to powi(2) for f32.
    (dx * dx + dy * dy).sqrt()
}
/// Debiased Sinkhorn divergence between two point clouds, each treated as a
/// uniform discrete distribution over its points.
///
/// Panics (via `expect`) if the underlying solver fails to converge.
fn cluster_sinkhorn_divergence(xs: &[Vec<f32>], ys: &[Vec<f32>]) -> f32 {
    // Uniform weight vectors: every point carries mass 1/|cluster|.
    let weights_x = Array1::from_elem(xs.len(), 1.0 / xs.len() as f32);
    let weights_y = Array1::from_elem(ys.len(), 1.0 / ys.len() as f32);
    // Cross-cost plus the two self-costs needed for the debiased divergence.
    let c_xy = cost_matrix(xs, ys);
    let c_xx = cost_matrix(xs, xs);
    let c_yy = cost_matrix(ys, ys);
    // Entropic regularisation strength, iteration cap, and stopping tolerance.
    let reg = 5.0;
    let max_iter = 2000;
    let tol = 5e-2;
    wass::sinkhorn_divergence_general(
        &weights_x, &weights_y, &c_xy, &c_xx, &c_yy, reg, max_iter, tol,
    )
    .expect("Sinkhorn divergence converged")
}
/// Demo: cluster three synthetic Gaussian blobs with k-means, then compare
/// pairwise centroid (Euclidean) distances against Sinkhorn divergences to
/// show how a distributional metric reacts to cluster shape.
fn main() {
    // Fixed seed so the demo output is reproducible run to run.
    let mut rng = StdRng::seed_from_u64(42);
    let n = 80;
    // Two tight round clusters plus one cluster strongly elongated along x.
    let cluster_a = sample_cluster(&mut rng, -4.0, 0.0, 0.5, 0.5, n);
    let cluster_b = sample_cluster(&mut rng, 4.0, 0.0, 0.5, 0.5, n);
    let cluster_c = sample_cluster(&mut rng, 0.0, 4.0, 3.0, 0.3, n);
    let all_points: Vec<Vec<f32>> = cluster_a
        .iter()
        .chain(cluster_b.iter())
        .chain(cluster_c.iter())
        .cloned()
        .collect();
    let fit = clump::Kmeans::new(3)
        .with_seed(7)
        .fit(&all_points)
        .expect("k-means converged");
    println!("k-means converged in {} iterations", fit.iters);
    println!("centroids:");
    for (i, c) in fit.centroids.iter().enumerate() {
        println!(" cluster {i}: ({:.2}, {:.2})", c[0], c[1]);
    }
    // Regroup the input points by their assigned k-means label.
    let mut groups: Vec<Vec<Vec<f32>>> = vec![vec![]; 3];
    for (i, &label) in fit.labels.iter().enumerate() {
        groups[label].push(all_points[i].clone());
    }
    let names = ["X", "Y", "Z"];
    let centroids: Vec<[f32; 2]> = groups.iter().map(|g| centroid(g)).collect();
    // Compute each pairwise measure exactly once — the Sinkhorn divergence is
    // expensive, and the original recomputed both measures a second time for
    // the ratio section. Pair order in cd/sd is (0,1), (0,2), (1,2).
    // BUGFIX: the distance calls previously read `¢roids[i]` — a
    // mis-encoding of `&centroids[i]` (`&c` garbled into `¢`) that did not
    // compile; restored to `&centroids[...]`.
    let mut cd = [0.0f32; 3];
    let mut sd = [0.0f32; 3];
    let mut idx = 0;
    for i in 0..3 {
        for j in (i + 1)..3 {
            cd[idx] = euclidean(&centroids[i], &centroids[j]);
            sd[idx] = cluster_sinkhorn_divergence(&groups[i], &groups[j]);
            idx += 1;
        }
    }
    println!("\n--- Centroid (Euclidean) distances ---");
    let mut idx = 0;
    for i in 0..3 {
        for j in (i + 1)..3 {
            println!(" d({}, {}) = {:.4}", names[i], names[j], cd[idx]);
            idx += 1;
        }
    }
    println!("\n--- Sinkhorn divergence (distributional) ---");
    let mut idx = 0;
    for i in 0..3 {
        for j in (i + 1)..3 {
            println!(" SD({}, {}) = {:.4}", names[i], names[j], sd[idx]);
            idx += 1;
        }
    }
    println!("\n--- Ratio comparison ---");
    // Index 1 is the (X, Z) pair, index 2 the (Y, Z) pair.
    println!(
        " centroid ratio d({},{})/d({},{}) = {:.4}",
        names[0],
        names[2],
        names[1],
        names[2],
        cd[1] / cd[2]
    );
    println!(
        " Sinkhorn ratio SD({},{})/SD({},{}) = {:.4}",
        names[0],
        names[2],
        names[1],
        names[2],
        sd[1] / sd[2]
    );
    println!();
    println!("Centroid distance treats each cluster as a point -- ratios reflect");
    println!("only center positions. Sinkhorn divergence sees the full distribution,");
    println!("so the elongated cluster (stretched along x) registers differently");
    println!("relative to each neighbour.");
}