use crate::accumulator::{binned_sum_f64, BinnedAccumulatorF64};
use std::collections::BTreeSet;
#[inline]
fn sq_dist(a: &[f64], b: &[f64]) -> f64 {
debug_assert_eq!(a.len(), b.len());
let diffs: Vec<f64> = a.iter().zip(b.iter()).map(|(&ai, &bi)| {
let d = ai - bi;
d * d
}).collect();
binned_sum_f64(&diffs)
}
pub fn kmeans(
data: &[f64],
n_samples: usize,
n_features: usize,
k: usize,
max_iter: usize,
seed: u64,
) -> (Vec<f64>, Vec<usize>, f64) {
assert_eq!(data.len(), n_samples * n_features, "kmeans: data length mismatch");
assert!(k > 0 && k <= n_samples, "kmeans: k must be in [1, n_samples]");
let point = |i: usize| -> &[f64] {
&data[i * n_features..(i + 1) * n_features]
};
let mut rng = cjc_repro::Rng::seeded(seed);
let mut centroids = Vec::with_capacity(k * n_features);
let first = (rng.next_u64() as usize) % n_samples;
centroids.extend_from_slice(point(first));
for _c in 1..k {
let n_centroids = centroids.len() / n_features;
let mut dist_sq = Vec::with_capacity(n_samples);
for i in 0..n_samples {
let p = point(i);
let mut min_d = f64::INFINITY;
for j in 0..n_centroids {
let c = ¢roids[j * n_features..(j + 1) * n_features];
let d = sq_dist(p, c);
if d < min_d {
min_d = d;
}
}
dist_sq.push(min_d);
}
let total = binned_sum_f64(&dist_sq);
if total == 0.0 {
let idx = (rng.next_u64() as usize) % n_samples;
centroids.extend_from_slice(point(idx));
continue;
}
let threshold = rng.next_f64() * total;
let mut cumulative = 0.0;
let mut chosen = n_samples - 1;
for i in 0..n_samples {
cumulative += dist_sq[i];
if cumulative >= threshold {
chosen = i;
break;
}
}
centroids.extend_from_slice(point(chosen));
}
let mut labels = vec![0usize; n_samples];
for _iter in 0..max_iter {
let old_labels = labels.clone();
for i in 0..n_samples {
let p = point(i);
let mut best_k = 0;
let mut best_d = f64::INFINITY;
for j in 0..k {
let c = ¢roids[j * n_features..(j + 1) * n_features];
let d = sq_dist(p, c);
if d < best_d {
best_d = d;
best_k = j;
}
}
labels[i] = best_k;
}
let mut new_centroids = vec![0.0f64; k * n_features];
let mut counts = vec![0usize; k];
for i in 0..n_samples {
let cluster = labels[i];
counts[cluster] += 1;
let p = point(i);
for f in 0..n_features {
new_centroids[cluster * n_features + f] += p[f];
}
}
for j in 0..k {
if counts[j] == 0 {
continue;
}
for f in 0..n_features {
let mut acc = BinnedAccumulatorF64::new();
for i in 0..n_samples {
if labels[i] == j {
acc.add(data[i * n_features + f]);
}
}
new_centroids[j * n_features + f] = acc.finalize() / counts[j] as f64;
}
}
centroids = new_centroids;
if labels == old_labels {
break;
}
}
let mut inertia_values = Vec::with_capacity(n_samples);
for i in 0..n_samples {
let p = point(i);
let c = ¢roids[labels[i] * n_features..(labels[i] + 1) * n_features];
inertia_values.push(sq_dist(p, c));
}
let inertia = binned_sum_f64(&inertia_values);
(centroids, labels, inertia)
}
pub fn dbscan(
data: &[f64],
n_samples: usize,
n_features: usize,
eps: f64,
min_samples: usize,
) -> Vec<i64> {
assert_eq!(data.len(), n_samples * n_features, "dbscan: data length mismatch");
let eps_sq = eps * eps;
let point = |i: usize| -> &[f64] {
&data[i * n_features..(i + 1) * n_features]
};
let mut neighbors: Vec<Vec<usize>> = Vec::with_capacity(n_samples);
for i in 0..n_samples {
let pi = point(i);
let mut nbrs = Vec::new();
for j in 0..n_samples {
if sq_dist(pi, point(j)) <= eps_sq {
nbrs.push(j);
}
}
neighbors.push(nbrs);
}
let mut labels = vec![-1i64; n_samples];
let mut visited = BTreeSet::new();
let mut cluster_id: i64 = 0;
for i in 0..n_samples {
if visited.contains(&i) {
continue;
}
visited.insert(i);
if neighbors[i].len() < min_samples {
continue;
}
labels[i] = cluster_id;
let mut queue: Vec<usize> = neighbors[i].clone();
let mut qi = 0;
while qi < queue.len() {
let q = queue[qi];
qi += 1;
if !visited.contains(&q) {
visited.insert(q);
if neighbors[q].len() >= min_samples {
for &nb in &neighbors[q] {
if !visited.contains(&nb) || labels[nb] == -1 {
if !queue.contains(&nb) {
queue.push(nb);
}
}
}
}
}
if labels[q] == -1 {
labels[q] = cluster_id;
}
}
cluster_id += 1;
}
labels
}
pub fn agglomerative(
data: &[f64],
n_samples: usize,
n_features: usize,
n_clusters: usize,
linkage: &str,
) -> Vec<usize> {
assert_eq!(data.len(), n_samples * n_features, "agglomerative: data length mismatch");
assert!(n_clusters > 0 && n_clusters <= n_samples, "agglomerative: invalid n_clusters");
let point = |i: usize| -> &[f64] {
&data[i * n_features..(i + 1) * n_features]
};
let n = n_samples;
let condensed_idx = |i: usize, j: usize| -> usize {
debug_assert!(i < j);
i * n - i * (i + 1) / 2 + j - i - 1
};
let condensed_len = n * (n - 1) / 2;
let mut dist = vec![0.0f64; condensed_len];
for i in 0..n {
for j in (i + 1)..n {
dist[condensed_idx(i, j)] = sq_dist(point(i), point(j));
}
}
let mut active = BTreeSet::new();
for i in 0..n {
active.insert(i);
}
let mut sizes = vec![1usize; n];
let mut parent = vec![0usize; 2 * n]; for i in 0..n {
parent[i] = i;
}
let mut next_cluster = n;
let mut cluster_map: Vec<usize> = (0..n).collect();
for _ in 0..(n - n_clusters) {
let mut best_dist = f64::INFINITY;
let mut best_i = 0;
let mut best_j = 0;
let active_vec: Vec<usize> = active.iter().copied().collect();
for ai in 0..active_vec.len() {
for aj in (ai + 1)..active_vec.len() {
let ci = active_vec[ai];
let cj = active_vec[aj];
let (lo, hi) = if ci < cj { (ci, cj) } else { (cj, ci) };
let d = dist[condensed_idx(lo, hi)];
if d < best_dist || (d == best_dist && (lo, hi) < (best_i, best_j)) {
best_dist = d;
best_i = lo;
best_j = hi;
}
}
}
let new_id = next_cluster;
next_cluster += 1;
let size_i = sizes[best_i];
let size_j = sizes[best_j];
let size_new = size_i + size_j;
for s in 0..n {
if cluster_map[s] == best_i || cluster_map[s] == best_j {
cluster_map[s] = new_id;
}
}
active.remove(&best_i);
active.remove(&best_j);
let new_slot = best_i;
sizes.push(0); while sizes.len() <= new_id {
sizes.push(0);
}
sizes[new_id] = size_new;
sizes[new_slot] = size_new;
for &c in &active {
let (lo_i, hi_i) = if c < best_i { (c, best_i) } else { (best_i, c) };
let (lo_j, hi_j) = if c < best_j { (c, best_j) } else { (best_j, c) };
let d_ci = dist[condensed_idx(lo_i, hi_i)];
let d_cj = dist[condensed_idx(lo_j, hi_j)];
let new_dist = match linkage {
"single" => d_ci.min(d_cj),
"complete" => d_ci.max(d_cj),
"average" => {
(size_i as f64 * d_ci + size_j as f64 * d_cj) / size_new as f64
}
"ward" => {
let size_c = sizes[c] as f64;
let sn = size_new as f64;
let si = size_i as f64;
let sj = size_j as f64;
((size_c + si) * d_ci + (size_c + sj) * d_cj - size_c * best_dist)
/ (size_c + sn)
}
_ => panic!("agglomerative: unsupported linkage '{}'. Use single, complete, average, or ward.", linkage),
};
let (lo, hi) = if c < new_slot { (c, new_slot) } else { (new_slot, c) };
dist[condensed_idx(lo, hi)] = new_dist;
}
for s in 0..n {
if cluster_map[s] == new_id {
cluster_map[s] = new_slot;
}
}
active.insert(new_slot);
}
let unique_clusters: BTreeSet<usize> = cluster_map.iter().copied().collect();
let label_map: Vec<(usize, usize)> = unique_clusters.into_iter().enumerate().map(|(i, c)| (c, i)).collect();
let mut labels = vec![0usize; n_samples];
for i in 0..n_samples {
for &(c, l) in &label_map {
if cluster_map[i] == c {
labels[i] = l;
break;
}
}
}
labels
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_kmeans_two_clusters() {
let data = vec![
0.0, 0.0,
0.1, 0.1,
-0.1, 0.1,
0.1, -0.1,
-0.1, -0.1,
10.0, 10.0,
10.1, 10.1,
9.9, 10.1,
10.1, 9.9,
9.9, 9.9,
];
let n_samples = 5 + 5;
let n_features = 2;
let k = 2;
let (centroids, labels, inertia) = kmeans(&data, n_samples, n_features, k, 100, 42);
let unique: BTreeSet<usize> = labels.iter().copied().collect();
assert_eq!(unique.len(), 2);
assert!(labels[0..5].iter().all(|&l| l == labels[0]));
assert!(labels[5..10].iter().all(|&l| l == labels[5]));
assert_ne!(labels[0], labels[5]);
assert_eq!(centroids.len(), k * n_features);
assert!(inertia < 1.0, "inertia too large: {}", inertia);
}
#[test]
fn test_kmeans_deterministic() {
let data = vec![
1.0, 2.0,
3.0, 4.0,
5.0, 6.0,
7.0, 8.0,
9.0, 10.0,
11.0, 12.0,
];
let (c1, l1, i1) = kmeans(&data, 6, 2, 2, 50, 123);
let (c2, l2, i2) = kmeans(&data, 6, 2, 2, 50, 123);
assert_eq!(l1, l2);
assert_eq!(c1, c2);
assert_eq!(i1.to_bits(), i2.to_bits());
}
#[test]
fn test_kmeans_single_cluster() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
let (_centroids, labels, _inertia) = kmeans(&data, 3, 2, 1, 10, 0);
assert!(labels.iter().all(|&l| l == 0));
}
#[test]
fn test_dbscan_two_clusters_with_noise() {
let data = vec![
0.0, 0.0,
0.1, 0.0,
0.0, 0.1,
0.1, 0.1,
10.0, 10.0,
10.1, 10.0,
10.0, 10.1,
10.1, 10.1,
50.0, 50.0, ];
let n_samples = 9;
let n_features = 2;
let labels = dbscan(&data, n_samples, n_features, 0.5, 2);
assert!(labels[0] >= 0);
assert!(labels[0..4].iter().all(|&l| l == labels[0]));
assert!(labels[4] >= 0);
assert!(labels[4..8].iter().all(|&l| l == labels[4]));
assert_ne!(labels[0], labels[4]);
assert_eq!(labels[8], -1);
}
#[test]
fn test_dbscan_all_noise() {
let data = vec![0.0, 0.0, 100.0, 100.0, 200.0, 200.0];
let labels = dbscan(&data, 3, 2, 0.5, 2);
assert!(labels.iter().all(|&l| l == -1));
}
#[test]
fn test_dbscan_single_cluster() {
let data = vec![0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.1, 0.1];
let labels = dbscan(&data, 4, 2, 0.5, 2);
assert!(labels.iter().all(|&l| l == labels[0] && l >= 0));
}
#[test]
fn test_agglomerative_two_clusters_single() {
let data = vec![
0.0, 0.0,
0.1, 0.1,
0.2, 0.0,
10.0, 10.0,
10.1, 10.1,
10.2, 10.0,
];
let labels = agglomerative(&data, 6, 2, 2, "single");
assert!(labels[0..3].iter().all(|&l| l == labels[0]));
assert!(labels[3..6].iter().all(|&l| l == labels[3]));
assert_ne!(labels[0], labels[3]);
}
#[test]
fn test_agglomerative_complete_linkage() {
let data = vec![
0.0, 0.0,
0.1, 0.1,
10.0, 10.0,
10.1, 10.1,
];
let labels = agglomerative(&data, 4, 2, 2, "complete");
assert_eq!(labels[0], labels[1]);
assert_eq!(labels[2], labels[3]);
assert_ne!(labels[0], labels[2]);
}
#[test]
fn test_agglomerative_average_linkage() {
let data = vec![
0.0, 0.0,
1.0, 0.0,
20.0, 0.0,
21.0, 0.0,
];
let labels = agglomerative(&data, 4, 2, 2, "average");
assert_eq!(labels[0], labels[1]);
assert_eq!(labels[2], labels[3]);
assert_ne!(labels[0], labels[2]);
}
#[test]
fn test_agglomerative_ward_linkage() {
let data = vec![
0.0, 0.0,
0.5, 0.0,
10.0, 10.0,
10.5, 10.0,
];
let labels = agglomerative(&data, 4, 2, 2, "ward");
assert_eq!(labels[0], labels[1]);
assert_eq!(labels[2], labels[3]);
assert_ne!(labels[0], labels[2]);
}
#[test]
fn test_agglomerative_n_clusters_equals_n() {
let data = vec![0.0, 1.0, 2.0, 3.0];
let labels = agglomerative(&data, 4, 1, 4, "single");
let unique: BTreeSet<usize> = labels.iter().copied().collect();
assert_eq!(unique.len(), 4);
}
#[test]
fn test_agglomerative_single_cluster() {
let data = vec![0.0, 1.0, 2.0, 3.0];
let labels = agglomerative(&data, 4, 1, 1, "single");
assert!(labels.iter().all(|&l| l == 0));
}
#[test]
fn test_kmeans_different_seeds_may_differ() {
let data = vec![
0.0, 0.0, 1.0, 1.0, 2.0, 2.0,
10.0, 10.0, 11.0, 11.0, 12.0, 12.0,
];
let (_, l1, _) = kmeans(&data, 6, 2, 2, 50, 1);
let (_, l2, _) = kmeans(&data, 6, 2, 2, 50, 2);
let u1: BTreeSet<usize> = l1.iter().copied().collect();
let u2: BTreeSet<usize> = l2.iter().copied().collect();
assert_eq!(u1.len(), 2);
assert_eq!(u2.len(), 2);
}
#[test]
fn test_dbscan_deterministic() {
let data = vec![
0.0, 0.0, 0.1, 0.0, 0.0, 0.1,
10.0, 10.0, 10.1, 10.0, 10.0, 10.1,
];
let l1 = dbscan(&data, 6, 2, 0.5, 2);
let l2 = dbscan(&data, 6, 2, 0.5, 2);
assert_eq!(l1, l2);
}
#[test]
fn test_agglomerative_deterministic() {
let data = vec![
0.0, 0.0, 0.1, 0.1, 10.0, 10.0, 10.1, 10.1,
];
let l1 = agglomerative(&data, 4, 2, 2, "ward");
let l2 = agglomerative(&data, 4, 2, 2, "ward");
assert_eq!(l1, l2);
}
}