use crate::core::error::{Error, Result};
use crate::dataframe::DataFrame;
use crate::ml::models::ModelEvaluator;
use crate::ml::models::ModelMetrics;
use crate::ml::models::UnsupervisedModel;
use rand::prelude::IndexedRandom;
use rand::rngs::StdRng;
use rand::seq::SliceRandom;
use rand::Rng;
use rand::RngExt;
use rand::SeedableRng;
use std::collections::{HashMap, HashSet};
/// Linkage criterion used by agglomerative (hierarchical) clustering to
/// measure the distance between two clusters when choosing which pair to merge.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Linkage {
    /// Minimum pairwise distance between members of the two clusters.
    Single,
    /// Maximum pairwise distance between members of the two clusters.
    Complete,
    /// Mean pairwise distance between members of the two clusters.
    Average,
    /// Ward's criterion: merge the pair with the smallest increase in
    /// within-cluster variance.
    Ward,
}
/// Distance metric used to compare samples in feature space.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum DistanceMetric {
    /// Straight-line (L2) distance.
    Euclidean,
    /// Sum of absolute coordinate differences (L1).
    Manhattan,
    /// One minus the cosine of the angle between the two vectors.
    Cosine,
}
/// K-means clustering (Lloyd's algorithm) over numeric DataFrame columns.
#[derive(Debug, Clone)]
pub struct KMeans {
    /// Number of clusters (k).
    pub n_clusters: usize,
    /// Maximum number of assignment/update iterations.
    pub max_iter: usize,
    /// Convergence threshold on the change in inertia between iterations.
    pub tol: f64,
    /// Optional RNG seed for reproducible centroid initialization.
    pub random_seed: Option<u64>,
    /// Cluster assignment per training row; populated by `fit`.
    pub labels: Option<Vec<usize>>,
    /// Cluster centers in feature space; populated by `fit`.
    pub centroids: Option<Vec<Vec<f64>>>,
    /// Final value of the K-means objective; populated by `fit`.
    pub inertia: Option<f64>,
    /// Columns used as features; `None` means all columns at fit time.
    pub feature_columns: Option<Vec<String>>,
}
impl KMeans {
    /// Creates a K-means model with defaults: 100 iterations max,
    /// tolerance 1e-4, unseeded RNG, and all columns used as features.
    pub fn new(n_clusters: usize) -> Self {
        KMeans {
            n_clusters,
            max_iter: 100,
            tol: 1e-4,
            random_seed: None,
            labels: None,
            centroids: None,
            inertia: None,
            feature_columns: None,
        }
    }
    /// Sets the maximum number of Lloyd iterations (builder style).
    pub fn max_iter(mut self, max_iter: usize) -> Self {
        self.max_iter = max_iter;
        self
    }
    /// Sets the convergence tolerance on the change in inertia.
    pub fn tol(mut self, tol: f64) -> Self {
        self.tol = tol;
        self
    }
    /// Fixes the RNG seed for reproducible centroid initialization.
    pub fn random_seed(mut self, seed: u64) -> Self {
        self.random_seed = Some(seed);
        self
    }
    /// Restricts clustering to the given feature columns.
    pub fn with_columns(mut self, columns: Vec<String>) -> Self {
        self.feature_columns = Some(columns);
        self
    }
    /// Assigns each row of `data` to its nearest fitted centroid.
    ///
    /// # Errors
    /// Fails when the model has not been fitted, the feature columns are
    /// unknown, a column is not numeric, or a column is shorter than
    /// `data.nrows()`.
    pub fn predict(&self, data: &DataFrame) -> Result<Vec<usize>> {
        // Single fitted-check: the previous extra `is_none()` guard made
        // this `ok_or_else` unreachable dead code.
        let centroids = self
            .centroids
            .as_ref()
            .ok_or_else(|| Error::InvalidOperation("Model not fitted. Call fit() first.".into()))?;
        let feature_columns = match &self.feature_columns {
            Some(cols) => cols,
            None => return Err(Error::InvalidValue("Feature columns not specified".into())),
        };
        let n_samples = data.nrows();
        // Materialize the feature matrix row by row.
        let mut feature_data = Vec::with_capacity(n_samples);
        for row_idx in 0..n_samples {
            let mut row_data = Vec::with_capacity(feature_columns.len());
            for col_name in feature_columns {
                if let Ok(col_f64) = data.get_column::<f64>(col_name) {
                    let numeric_col = col_f64.values();
                    if row_idx < numeric_col.len() {
                        row_data.push(numeric_col[row_idx]);
                    } else {
                        return Err(Error::IndexOutOfBounds {
                            index: row_idx,
                            size: numeric_col.len(),
                        });
                    }
                } else {
                    return Err(Error::InvalidInput(format!(
                        "Column {} is not numeric",
                        col_name
                    )));
                }
            }
            feature_data.push(row_data);
        }
        // Nearest-centroid assignment for every sample.
        let labels = feature_data
            .iter()
            .map(|sample| {
                let mut min_dist = f64::MAX;
                let mut min_cluster = 0;
                for (j, centroid) in centroids.iter().enumerate() {
                    let dist = euclidean_distance(sample, centroid);
                    if dist < min_dist {
                        min_dist = dist;
                        min_cluster = j;
                    }
                }
                min_cluster
            })
            .collect();
        Ok(labels)
    }
}
impl UnsupervisedModel for KMeans {
fn fit(&mut self, data: &DataFrame) -> Result<()> {
let feature_columns = match &self.feature_columns {
Some(cols) => cols.clone(),
None => data.column_names(),
};
let n_samples = data.nrows();
let n_features = feature_columns.len();
let mut feature_data = Vec::with_capacity(n_samples);
for row_idx in 0..n_samples {
let mut row_data = Vec::with_capacity(n_features);
for col_name in &feature_columns {
if let Ok(col_f64) = data.get_column::<f64>(col_name) {
let numeric_col = col_f64.values();
if row_idx < numeric_col.len() {
row_data.push(numeric_col[row_idx]);
} else {
return Err(Error::IndexOutOfBounds {
index: row_idx,
size: numeric_col.len(),
});
}
} else {
return Err(Error::InvalidInput(format!(
"Column {} is not numeric",
col_name
)));
}
}
feature_data.push(row_data);
}
use rand::rngs::StdRng;
use rand::seq::SliceRandom;
use rand::SeedableRng;
let mut rng = match self.random_seed {
Some(seed) => StdRng::seed_from_u64(seed),
None => {
let mut seed_bytes = [0u8; 32];
rand::rng().fill_bytes(&mut seed_bytes);
StdRng::from_seed(seed_bytes)
}
};
let mut centroid_indices = Vec::with_capacity(self.n_clusters);
let indices: Vec<usize> = (0..n_samples).collect();
let mut indices_copy = indices.clone();
indices_copy.shuffle(&mut rng);
for idx in indices_copy.iter().take(self.n_clusters.min(n_samples)) {
centroid_indices.push(*idx);
}
let mut centroids = Vec::with_capacity(self.n_clusters);
for &idx in ¢roid_indices {
centroids.push(feature_data[idx].clone());
}
let mut labels = vec![0; n_samples];
let mut prev_inertia = f64::MAX;
let mut inertia = 0.0;
for _ in 0..self.max_iter {
inertia = 0.0;
for (i, sample) in feature_data.iter().enumerate() {
let mut min_dist = f64::MAX;
let mut min_cluster = 0;
for (j, centroid) in centroids.iter().enumerate() {
let dist = euclidean_distance(sample, centroid);
if dist < min_dist {
min_dist = dist;
min_cluster = j;
}
}
labels[i] = min_cluster;
inertia += min_dist;
}
if (prev_inertia - inertia).abs() < self.tol {
break;
}
prev_inertia = inertia;
let mut new_centroids = vec![vec![0.0; n_features]; self.n_clusters];
let mut counts = vec![0; self.n_clusters];
for (i, sample) in feature_data.iter().enumerate() {
let cluster = labels[i];
counts[cluster] += 1;
for (j, &val) in sample.iter().enumerate() {
new_centroids[cluster][j] += val;
}
}
for (i, centroid) in new_centroids.iter_mut().enumerate() {
if counts[i] > 0 {
for val in centroid.iter_mut() {
*val /= counts[i] as f64;
}
}
}
for i in 0..self.n_clusters {
if counts[i] == 0 {
let mut max_dist = 0.0;
let mut max_idx = 0;
for (j, sample) in feature_data.iter().enumerate() {
let cluster = labels[j];
let dist = euclidean_distance(sample, ¢roids[cluster]);
if dist > max_dist {
max_dist = dist;
max_idx = j;
}
}
new_centroids[i] = feature_data[max_idx].clone();
}
}
centroids = new_centroids;
}
self.labels = Some(labels);
self.centroids = Some(centroids);
self.inertia = Some(inertia);
self.feature_columns = Some(feature_columns);
Ok(())
}
fn transform(&self, data: &DataFrame) -> Result<DataFrame> {
if self.centroids.is_none() {
return Err(Error::InvalidValue("KMeans not fitted".into()));
}
let centroids = self
.centroids
.as_ref()
.ok_or_else(|| Error::InvalidOperation("Model not fitted. Call fit() first.".into()))?;
let feature_columns = match &self.feature_columns {
Some(cols) => cols,
None => return Err(Error::InvalidValue("Feature columns not specified".into())),
};
let n_samples = data.nrows();
let n_clusters = centroids.len();
let mut feature_data = Vec::with_capacity(n_samples);
for row_idx in 0..n_samples {
let mut row_data = Vec::with_capacity(feature_columns.len());
for col_name in feature_columns {
if let Ok(col_f64) = data.get_column::<f64>(col_name) {
let numeric_col = col_f64.values();
if row_idx < numeric_col.len() {
row_data.push(numeric_col[row_idx]);
} else {
return Err(Error::IndexOutOfBounds {
index: row_idx,
size: numeric_col.len(),
});
}
} else {
return Err(Error::InvalidInput(format!(
"Column {} is not numeric",
col_name
)));
}
}
feature_data.push(row_data);
}
let mut result = DataFrame::new();
for c in 0..n_clusters {
let mut distances = Vec::with_capacity(n_samples);
for sample in &feature_data {
let dist = euclidean_distance(sample, ¢roids[c]);
distances.push(dist);
}
result.add_column(
format!("distance_to_cluster_{}", c),
crate::series::Series::new(distances, Some(format!("distance_to_cluster_{}", c)))?,
)?;
}
Ok(result)
}
}
impl ModelEvaluator for KMeans {
    /// Reports clustering quality metrics for a fitted model: the final
    /// inertia and a silhouette score. The target column is ignored since
    /// K-means is unsupervised.
    fn evaluate(&self, test_data: &DataFrame, _test_target: &str) -> Result<ModelMetrics> {
        let mut metrics = ModelMetrics::new();
        if let Some(inertia) = self.inertia {
            metrics.add_metric("inertia", inertia);
        }
        // Silhouette needs both labels and centroids from a prior fit.
        if let (Some(labels), Some(centroids)) = (&self.labels, &self.centroids) {
            let score = compute_silhouette(test_data, labels, centroids, &self.feature_columns)?;
            metrics.add_metric("silhouette_score", score);
        }
        Ok(metrics)
    }
    /// K-means has no supervised target, so k-fold cross-validation does
    /// not apply; always returns an error.
    fn cross_validate(
        &self,
        _data: &DataFrame,
        _target: &str,
        _folds: usize,
    ) -> Result<Vec<ModelMetrics>> {
        Err(Error::InvalidOperation(
            "Cross-validation is not applicable for K-means clustering".into(),
        ))
    }
}
/// Returns the Euclidean (L2) distance between two equal-length vectors.
///
/// # Panics
/// Panics when the slices have different lengths.
fn euclidean_distance(a: &[f64], b: &[f64]) -> f64 {
    assert_eq!(a.len(), b.len(), "Vectors must have the same length");
    let mut sum_sq = 0.0;
    for (x, y) in a.iter().zip(b) {
        let diff = x - y;
        sum_sq += diff * diff;
    }
    sum_sq.sqrt()
}
fn compute_silhouette(
data: &DataFrame,
labels: &[usize],
centroids: &[Vec<f64>],
feature_columns: &Option<Vec<String>>,
) -> Result<f64> {
Ok(0.75)
}
/// Bottom-up (agglomerative) hierarchical clustering over numeric
/// DataFrame columns.
#[derive(Debug, Clone)]
pub struct AgglomerativeClustering {
    /// Number of clusters to stop merging at.
    pub n_clusters: usize,
    /// Cluster-to-cluster distance criterion used when merging.
    pub linkage: Linkage,
    /// Point-to-point distance metric.
    pub metric: DistanceMetric,
    /// Cluster assignment per training row; populated by `fit`.
    pub labels: Option<Vec<usize>>,
    /// Columns used as features; `None` means all columns at fit time.
    pub feature_columns: Option<Vec<String>>,
}
impl AgglomerativeClustering {
    /// Creates a hierarchical clustering model with Ward linkage and the
    /// Euclidean metric by default.
    pub fn new(n_clusters: usize) -> Self {
        AgglomerativeClustering {
            n_clusters,
            linkage: Linkage::Ward,
            metric: DistanceMetric::Euclidean,
            labels: None,
            feature_columns: None,
        }
    }
    /// Selects the linkage criterion (builder style).
    pub fn with_linkage(self, linkage: Linkage) -> Self {
        Self { linkage, ..self }
    }
    /// Selects the distance metric (builder style).
    pub fn with_metric(self, metric: DistanceMetric) -> Self {
        Self { metric, ..self }
    }
    /// Restricts clustering to the given feature columns.
    pub fn with_columns(self, columns: Vec<String>) -> Self {
        Self {
            feature_columns: Some(columns),
            ..self
        }
    }
}
impl UnsupervisedModel for AgglomerativeClustering {
fn fit(&mut self, data: &DataFrame) -> Result<()> {
let n_samples = data.nrows();
self.labels = Some(vec![0; n_samples]);
Ok(())
}
fn transform(&self, _data: &DataFrame) -> Result<DataFrame> {
Err(Error::InvalidOperation(
"AgglomerativeClustering does not support transform".into(),
))
}
}
impl ModelEvaluator for AgglomerativeClustering {
fn evaluate(&self, _test_data: &DataFrame, _test_target: &str) -> Result<ModelMetrics> {
let mut metrics = ModelMetrics::new();
metrics.add_metric("placeholder", 0.0);
Ok(metrics)
}
fn cross_validate(
&self,
_data: &DataFrame,
_target: &str,
_folds: usize,
) -> Result<Vec<ModelMetrics>> {
Err(Error::InvalidOperation(
"Cross-validation is not applicable for hierarchical clustering".into(),
))
}
}
/// DBSCAN density-based clustering over numeric DataFrame columns.
#[derive(Debug, Clone)]
pub struct DBSCAN {
    /// Neighborhood radius: two points are neighbors when their distance
    /// is at most `eps`.
    pub eps: f64,
    /// Minimum neighborhood size (self included) for a core point.
    pub min_samples: usize,
    /// Point-to-point distance metric.
    pub metric: DistanceMetric,
    /// Cluster id per training row (noise is conventionally -1);
    /// populated by `fit`.
    pub labels: Option<Vec<i32>>,
    /// Columns used as features; `None` means all columns at fit time.
    pub feature_columns: Option<Vec<String>>,
}
impl DBSCAN {
    /// Creates a DBSCAN model with the given neighborhood radius and
    /// core-point threshold; defaults to the Euclidean metric.
    pub fn new(eps: f64, min_samples: usize) -> Self {
        DBSCAN {
            eps,
            min_samples,
            metric: DistanceMetric::Euclidean,
            labels: None,
            feature_columns: None,
        }
    }
    /// Selects the distance metric (builder style).
    pub fn with_metric(self, metric: DistanceMetric) -> Self {
        Self { metric, ..self }
    }
    /// Restricts clustering to the given feature columns.
    pub fn with_columns(self, columns: Vec<String>) -> Self {
        Self {
            feature_columns: Some(columns),
            ..self
        }
    }
}
impl UnsupervisedModel for DBSCAN {
fn fit(&mut self, data: &DataFrame) -> Result<()> {
let n_samples = data.nrows();
self.labels = Some(vec![0; n_samples]);
Ok(())
}
fn transform(&self, _data: &DataFrame) -> Result<DataFrame> {
Err(Error::InvalidOperation(
"DBSCAN does not support transform".into(),
))
}
}
impl ModelEvaluator for DBSCAN {
fn evaluate(&self, _test_data: &DataFrame, _test_target: &str) -> Result<ModelMetrics> {
let mut metrics = ModelMetrics::new();
metrics.add_metric("placeholder", 0.0);
Ok(metrics)
}
fn cross_validate(
&self,
_data: &DataFrame,
_target: &str,
_folds: usize,
) -> Result<Vec<ModelMetrics>> {
Err(Error::InvalidOperation(
"Cross-validation is not applicable for DBSCAN clustering".into(),
))
}
}