37_kmeans_small/37_kmeans_small.rs
1//! # Example: K-Means (small)
2//!
3//! Run: cargo run --example 37_kmeans_small
4//!
5//! ## Problem
6//! Group a handful of 2-D points into `k` clusters by repeatedly assigning each
7//! point to the nearest cluster center and moving each center to the mean of its
8//! points (Lloyd's algorithm).
9//!
10//! ## Math idea
11//! Start from fixed initial centroids (no random seeding, so the run is
12//! deterministic). Each iteration: assign every point to its nearest centroid by
13//! squared Euclidean distance, then recompute each centroid as the mean of its
14//! assigned points. Repeat until assignments stop changing.
15//!
16//! ## Tensor representation
17//! The dataset is a `[points, features]` `Tensor`; each row is one point. Centroids
18//! are kept as small vectors that are recomputed each pass.
19//!
20//! ## What this demonstrates
21//! - using a `Tensor` as a `[samples, features]` data matrix;
22//! - a nearest-centroid assignment via `Tensor::argmin` (RFC-038);
23//! - composing `Tensor` row access with plain Rust arithmetic.
24//!
25//! ## Expected output
26//! ```text
27//! assignments: [0, 0, 0, 1, 1, 1]
28//! centroid 0: [1.5000, 1.5000]
29//! centroid 1: [8.0000, 7.8333]
30//! K-means (small): OK
31//! ```
32//!
33//! This is an algorithm demonstration, not an ML framework: `k`, the initial
34//! centroids, and the iteration count are all fixed and explicit.
35
36use matten::Tensor;
37
38/// Squared Euclidean distance between two equal-length points.
39fn sq_dist(a: &[f64], b: &[f64]) -> f64 {
40 a.iter().zip(b).map(|(x, y)| (x - y) * (x - y)).sum()
41}
42
43/// Index of the nearest centroid to `point`, via `Tensor::argmin` over the
44/// per-centroid distances.
45fn nearest(point: &[f64], centroids: &[Vec<f64>]) -> usize {
46 let dists: Vec<f64> = centroids.iter().map(|c| sq_dist(point, c)).collect();
47 Tensor::from_vec(dists).argmin()
48}
49
50fn main() {
51 // 6 points (rows), 2 features (columns): two obvious clusters.
52 let points = Tensor::new(
53 vec![
54 1.0, 1.0, //
55 1.5, 2.0, //
56 2.0, 1.5, //
57 8.0, 8.0, //
58 8.5, 7.5, //
59 7.5, 8.0, //
60 ],
61 &[6, 2],
62 );
63 let n = points.shape()[0];
64 let dim = points.shape()[1];
65 let data = points.as_slice();
66 let row = |i: usize| &data[i * dim..(i + 1) * dim];
67
68 // Fixed (deterministic) initial centroids — no random seeding.
69 let mut centroids = vec![vec![0.0, 0.0], vec![10.0, 10.0]];
70 let mut assignments = vec![0usize; n];
71
72 for _ in 0..5 {
73 // Assign each point to its nearest centroid.
74 for (i, a) in assignments.iter_mut().enumerate() {
75 *a = nearest(row(i), ¢roids);
76 }
77 // Recompute each centroid as the mean of its assigned points.
78 for (c, centroid) in centroids.iter_mut().enumerate() {
79 let mut sum = vec![0.0; dim];
80 let mut count = 0.0;
81 for (i, &a) in assignments.iter().enumerate() {
82 if a == c {
83 for (d, s) in sum.iter_mut().enumerate() {
84 *s += row(i)[d];
85 }
86 count += 1.0;
87 }
88 }
89 if count > 0.0 {
90 for (d, slot) in centroid.iter_mut().enumerate() {
91 *slot = sum[d] / count;
92 }
93 }
94 }
95 }
96
97 println!("assignments: {assignments:?}");
98 for (c, centroid) in centroids.iter().enumerate() {
99 println!("centroid {c}: [{:.4}, {:.4}]", centroid[0], centroid[1]);
100 }
101
102 assert_eq!(assignments, vec![0, 0, 0, 1, 1, 1]);
103 assert!((centroids[0][0] - 1.5).abs() < 1e-9 && (centroids[0][1] - 1.5).abs() < 1e-9);
104 assert!((centroids[1][0] - 8.0).abs() < 1e-9 && (centroids[1][1] - 23.5 / 3.0).abs() < 1e-9);
105 println!("K-means (small): OK");
106}