Skip to main content

37_kmeans_small/
37_kmeans_small.rs

1//! # Example: K-Means (small)
2//!
3//! Run: cargo run --example 37_kmeans_small
4//!
5//! ## Problem
6//! Group a handful of 2-D points into `k` clusters by repeatedly assigning each
7//! point to the nearest cluster center and moving each center to the mean of its
8//! points (Lloyd's algorithm).
9//!
10//! ## Math idea
11//! Start from fixed initial centroids (no random seeding, so the run is
12//! deterministic). Each iteration: assign every point to its nearest centroid by
13//! squared Euclidean distance, then recompute each centroid as the mean of its
14//! assigned points. Repeat until assignments stop changing.
15//!
16//! ## Tensor representation
17//! The dataset is a `[points, features]` `Tensor`; each row is one point. Centroids
18//! are kept as small vectors that are recomputed each pass.
19//!
20//! ## What this demonstrates
21//! - using a `Tensor` as a `[samples, features]` data matrix;
22//! - a nearest-centroid assignment via `Tensor::argmin` (RFC-038);
23//! - composing `Tensor` row access with plain Rust arithmetic.
24//!
25//! ## Expected output
26//! ```text
27//! assignments: [0, 0, 0, 1, 1, 1]
28//! centroid 0: [1.5000, 1.5000]
29//! centroid 1: [8.0000, 7.8333]
30//! K-means (small): OK
31//! ```
32//!
33//! This is an algorithm demonstration, not an ML framework: `k`, the initial
34//! centroids, and the iteration count are all fixed and explicit.
35
36use matten::Tensor;
37
38/// Squared Euclidean distance between two equal-length points.
39fn sq_dist(a: &[f64], b: &[f64]) -> f64 {
40    a.iter().zip(b).map(|(x, y)| (x - y) * (x - y)).sum()
41}
42
43/// Index of the nearest centroid to `point`, via `Tensor::argmin` over the
44/// per-centroid distances.
45fn nearest(point: &[f64], centroids: &[Vec<f64>]) -> usize {
46    let dists: Vec<f64> = centroids.iter().map(|c| sq_dist(point, c)).collect();
47    Tensor::from_vec(dists).argmin()
48}
49
50fn main() {
51    // 6 points (rows), 2 features (columns): two obvious clusters.
52    let points = Tensor::new(
53        vec![
54            1.0, 1.0, //
55            1.5, 2.0, //
56            2.0, 1.5, //
57            8.0, 8.0, //
58            8.5, 7.5, //
59            7.5, 8.0, //
60        ],
61        &[6, 2],
62    );
63    let n = points.shape()[0];
64    let dim = points.shape()[1];
65    let data = points.as_slice();
66    let row = |i: usize| &data[i * dim..(i + 1) * dim];
67
68    // Fixed (deterministic) initial centroids — no random seeding.
69    let mut centroids = vec![vec![0.0, 0.0], vec![10.0, 10.0]];
70    let mut assignments = vec![0usize; n];
71
72    for _ in 0..5 {
73        // Assign each point to its nearest centroid.
74        for (i, a) in assignments.iter_mut().enumerate() {
75            *a = nearest(row(i), &centroids);
76        }
77        // Recompute each centroid as the mean of its assigned points.
78        for (c, centroid) in centroids.iter_mut().enumerate() {
79            let mut sum = vec![0.0; dim];
80            let mut count = 0.0;
81            for (i, &a) in assignments.iter().enumerate() {
82                if a == c {
83                    for (d, s) in sum.iter_mut().enumerate() {
84                        *s += row(i)[d];
85                    }
86                    count += 1.0;
87                }
88            }
89            if count > 0.0 {
90                for (d, slot) in centroid.iter_mut().enumerate() {
91                    *slot = sum[d] / count;
92                }
93            }
94        }
95    }
96
97    println!("assignments: {assignments:?}");
98    for (c, centroid) in centroids.iter().enumerate() {
99        println!("centroid {c}: [{:.4}, {:.4}]", centroid[0], centroid[1]);
100    }
101
102    assert_eq!(assignments, vec![0, 0, 0, 1, 1, 1]);
103    assert!((centroids[0][0] - 1.5).abs() < 1e-9 && (centroids[0][1] - 1.5).abs() < 1e-9);
104    assert!((centroids[1][0] - 8.0).abs() < 1e-9 && (centroids[1][1] - 23.5 / 3.0).abs() < 1e-9);
105    println!("K-means (small): OK");
106}