use scirs2_core::ndarray::{s, Array1, Array2, ArrayView1, ArrayView2};
use scirs2_core::numeric::{Float, FromPrimitive};
use std::fmt::Debug;
use crate::error::{ClusteringError, Result};
mod distance_metrics;
mod distance_simd;
mod kmeans;
mod kmeans2;
mod minibatch_kmeans;
mod parallel_kmeans;
mod simd_kmeans;
mod simd_optimizations;
mod weighted_kmeans;
pub use self::distance_metrics::{
create_metric, ChebyshevDistance, CorrelationDistance, CosineDistance,
DistanceMetric as VQDistanceMetric, EuclideanDistance, MahalanobisDistance, ManhattanDistance,
MetricType, MinkowskiDistance,
};
pub use distance_simd::{
distance_to_centroids_simd, pairwise_euclidean_parallel, pairwise_euclidean_simd,
};
pub use kmeans::{
kmeans, kmeans_init, kmeans_plus_plus, kmeans_with_metric, kmeans_with_options, KMeansInit,
KMeansOptions,
};
pub use kmeans2::{kmeans2, kmeans2_str, MinitMethod, MissingMethod};
pub use minibatch_kmeans::*;
pub use parallel_kmeans::{parallel_kmeans, ParallelKMeansOptions};
pub use simd_kmeans::{kmeans_plus_plus_simd, kmeans_simd, mini_batch_kmeans_simd};
pub use simd_optimizations::{
calculate_distortion_simd, compute_centroids_simd, euclidean_distance_simd, vq_simd,
whiten_simd, SimdOptimizationConfig,
};
pub use weighted_kmeans::{weighted_kmeans, weighted_kmeans_plus_plus, WeightedKMeansOptions};
#[allow(dead_code)]
pub fn euclidean_distance<F>(x: ArrayView1<F>, y: ArrayView1<F>) -> F
where
F: Float + FromPrimitive,
{
let mut sum = F::zero();
for i in 0..x.len() {
let diff = x[i] - y[i];
sum = sum + diff * diff;
}
sum.sqrt()
}
#[allow(dead_code)]
pub fn whiten<F>(obs: &Array2<F>) -> Result<Array2<F>>
where
F: Float + FromPrimitive + std::fmt::Debug,
{
let n_samples = obs.shape()[0];
let n_features = obs.shape()[1];
let mut means = Array1::<F>::zeros(n_features);
for j in 0..n_features {
let mut sum = F::zero();
for i in 0..n_samples {
sum = sum + obs[[i, j]];
}
means[j] = sum / F::from(n_samples).expect("Failed to convert to float");
}
let mut stds = Array1::<F>::zeros(n_features);
for j in 0..n_features {
let mut sum = F::zero();
for i in 0..n_samples {
let diff = obs[[i, j]] - means[j];
sum = sum + diff * diff;
}
stds[j] = (sum / F::from(n_samples - 1).expect("Failed to convert to float")).sqrt();
if stds[j] < F::from(1e-10).expect("Failed to convert constant to float") {
stds[j] = F::one();
}
}
let mut whitened = Array2::<F>::zeros((n_samples, n_features));
for i in 0..n_samples {
for j in 0..n_features {
whitened[[i, j]] = (obs[[i, j]] - means[j]) / stds[j];
}
}
Ok(whitened)
}
#[allow(dead_code)]
pub fn vq<F>(data: ArrayView2<F>, centroids: ArrayView2<F>) -> Result<(Array1<usize>, Array1<F>)>
where
F: Float + FromPrimitive + Debug,
{
if data.shape()[1] != centroids.shape()[1] {
return Err(ClusteringError::InvalidInput(format!(
"Observation array and centroid array must have the same number of dimensions. Got {} and {}",
data.shape()[1], centroids.shape()[1]
)));
}
let n_samples = data.shape()[0];
let n_centroids = centroids.shape()[0];
let mut labels = Array1::zeros(n_samples);
let mut distances = Array1::zeros(n_samples);
for i in 0..n_samples {
let point = data.slice(s![i, ..]);
let mut min_dist = F::infinity();
let mut closest_centroid = 0;
for j in 0..n_centroids {
let centroid = centroids.slice(s![j, ..]);
let dist = euclidean_distance(point, centroid);
if dist < min_dist {
min_dist = dist;
closest_centroid = j;
}
}
labels[i] = closest_centroid;
distances[i] = min_dist;
}
Ok((labels, distances))
}