use kategorize::{KModes, InitMethod};
use ndarray::Array2;
use std::collections::HashMap;
use std::time::Instant;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let data = generate_synthetic_data(1000, 8, 15);
println!("Generated synthetic data: {} samples, {} features", data.nrows(), data.ncols());
println!();
println!("=== Example 1: Finding optimal number of clusters ===");
let start = Instant::now();
let k_range = 2..=10;
let mut inertias = Vec::new();
for k in k_range.clone() {
let kmodes = KModes::new(k)
.init_method(InitMethod::Huang)
.random_state(42)
.n_init(3)
.max_iter(100);
let result = kmodes.fit(data.view())?;
inertias.push((k, result.inertia));
println!("k={:2}: inertia={:8.2}, converged={}", k, result.inertia, result.converged);
}
let mut best_k = 2;
let mut max_decrease = 0.0;
for i in 1..inertias.len() {
let decrease = inertias[i-1].1 - inertias[i].1;
let decrease_rate = decrease / inertias[i-1].1;
if decrease_rate > max_decrease {
max_decrease = decrease_rate;
best_k = inertias[i].0;
}
}
println!("Suggested optimal k: {} (largest relative inertia decrease: {:.1}%)",
best_k, max_decrease * 100.0);
println!("Time taken: {:.2}s", start.elapsed().as_secs_f64());
println!();
println!("=== Example 2: Initialization strategy comparison ===");
let test_data = data.slice(ndarray::s![0..200, ..]).to_owned();
let strategies = [
("Random", InitMethod::Random),
("Huang", InitMethod::Huang),
("Cao", InitMethod::Cao),
];
for (name, method) in &strategies {
let mut results = Vec::new();
let start = Instant::now();
for seed in 0..5 {
let kmodes = KModes::new(best_k)
.init_method(*method)
.random_state(seed)
.n_init(1)
.max_iter(100);
let result = kmodes.fit(test_data.view())?;
results.push(result.inertia);
}
let mean_inertia = results.iter().sum::<f64>() / results.len() as f64;
let min_inertia = results.iter().fold(f64::INFINITY, |a, &b| a.min(b));
let max_inertia = results.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
let std_dev = {
let variance = results.iter()
.map(|&x| (x - mean_inertia).powi(2))
.sum::<f64>() / results.len() as f64;
variance.sqrt()
};
println!("{:8} - Mean: {:8.2}, Min: {:8.2}, Max: {:8.2}, Std: {:6.2}, Time: {:5.3}s",
name, mean_inertia, min_inertia, max_inertia, std_dev, start.elapsed().as_secs_f64());
}
println!();
println!("=== Example 3: Performance optimization ===");
let large_data = data.slice(ndarray::s![0..500, ..]).to_owned();
let n_init_values = [1, 3, 5, 10];
for &n_init in &n_init_values {
let start = Instant::now();
let kmodes = KModes::new(best_k)
.init_method(InitMethod::Huang)
.random_state(42)
.n_init(n_init)
.max_iter(50)
;
let result = kmodes.fit(large_data.view())?;
let elapsed = start.elapsed().as_secs_f64();
println!("n_init={:2}: inertia={:8.2}, time={:5.3}s, time/init={:.3}s",
n_init, result.inertia, elapsed, elapsed / n_init as f64);
}
println!();
println!("=== Example 4: Cluster quality analysis ===");
let kmodes = KModes::new(best_k)
.init_method(InitMethod::Huang)
.random_state(42)
.n_init(5)
.max_iter(100);
let result = kmodes.fit(data.view())?;
let mut cluster_counts = HashMap::new();
for &label in result.labels.iter() {
*cluster_counts.entry(label).or_insert(0) += 1;
}
println!("Cluster size distribution:");
let mut sizes: Vec<_> = cluster_counts.iter().collect();
sizes.sort_by_key(|(k, _)| *k);
let total_points = result.labels.len();
for (cluster_id, count) in sizes {
let percentage = *count as f64 / total_points as f64 * 100.0;
println!(" Cluster {}: {:4} points ({:5.1}%)", cluster_id, count, percentage);
}
let min_size = cluster_counts.values().min().unwrap_or(&0);
let max_size = cluster_counts.values().max().unwrap_or(&0);
let size_ratio = *max_size as f64 / (*min_size).max(1) as f64;
println!("Size balance ratio (max/min): {:.1}", size_ratio);
if size_ratio > 5.0 {
println!("Warning: Clusters are highly imbalanced (ratio > 5.0)");
} else {
println!("Clusters are reasonably balanced");
}
println!();
println!("=== Example 5: Feature importance analysis ===");
let centroids = &result.centroids;
println!("Centroid diversity by feature:");
for feature_idx in 0..data.ncols() {
let mut unique_values = std::collections::HashSet::new();
for cluster_idx in 0..centroids.nrows() {
unique_values.insert(centroids[[cluster_idx, feature_idx]].clone());
}
let diversity = unique_values.len() as f64 / centroids.nrows() as f64;
println!(" Feature {}: {:.2} diversity ({} unique values out of {} clusters)",
feature_idx, diversity, unique_values.len(), centroids.nrows());
}
Ok(())
}
fn generate_synthetic_data(n_samples: usize, n_features: usize, n_categories: usize) -> Array2<String> {
use rand::prelude::*;
let mut rng = StdRng::seed_from_u64(12345);
let mut data = Vec::with_capacity(n_samples * n_features);
for sample_idx in 0..n_samples {
let cluster_bias = sample_idx % 4;
for feature_idx in 0..n_features {
let category = if rng.gen::<f64>() < 0.7 {
(cluster_bias * n_categories / 4 + feature_idx + rng.gen_range(0..2)) % n_categories
} else {
rng.gen_range(0..n_categories)
};
data.push(format!("cat_{}_{}", feature_idx, category));
}
}
Array2::from_shape_vec((n_samples, n_features), data).unwrap()
}