use super::config::KMeansConfig;
use crate::core::{ClusterResult, FormicaXError, OHLCV};
use std::collections::HashMap;
use std::time::Instant;
pub struct KMeans {
config: KMeansConfig,
centroids: Option<Vec<Vec<f64>>>,
assignments: Option<Vec<usize>>,
converged: bool,
iterations: usize,
}
impl KMeans {
pub fn with_config(config: KMeansConfig) -> Self {
Self {
config,
centroids: None,
assignments: None,
converged: false,
iterations: 0,
}
}
pub fn fit(&mut self, data: &[OHLCV]) -> Result<ClusterResult, FormicaXError> {
let start_time = Instant::now();
if data.is_empty() {
return Err(FormicaXError::Data(crate::core::DataError::EmptyDataset));
}
if data.len() < self.config.k {
return Err(FormicaXError::Data(
crate::core::DataError::InsufficientData {
min_points: self.config.k,
actual_points: data.len(),
},
));
}
let features = self.ohlcv_to_features(data)?;
let mut centroids = self.initialize_centroids(&features)?;
let mut assignments = vec![0; data.len()];
let mut converged = false;
let mut iterations = 0;
while iterations < self.config.max_iterations && !converged {
let new_assignments = self.assign_to_centroids(&features, ¢roids)?;
converged = new_assignments == assignments;
assignments = new_assignments;
if !converged {
centroids = self.update_centroids(&features, &assignments)?;
}
iterations += 1;
}
self.centroids = Some(centroids.clone());
self.assignments = Some(assignments.clone());
self.converged = converged;
self.iterations = iterations;
let silhouette_score = self.calculate_silhouette_score(&features, &assignments)?;
let inertia = self.calculate_inertia(&features, ¢roids, &assignments);
let execution_time = start_time.elapsed();
let mut metadata = HashMap::new();
metadata.insert(
"variant".to_string(),
serde_json::Value::String(format!("{:?}", self.config.variant)),
);
metadata.insert(
"parallel".to_string(),
serde_json::Value::Bool(self.config.parallel),
);
metadata.insert(
"simd".to_string(),
serde_json::Value::Bool(self.config.simd),
);
Ok(ClusterResult {
algorithm_name: format!("KMeans-{:?}", self.config.variant),
n_clusters: self.config.k,
cluster_assignments: assignments,
cluster_centers: Some(centroids),
inertia: Some(inertia),
silhouette_score,
iterations,
converged,
execution_time,
noise_points: vec![], core_points: vec![], border_points: vec![], metadata,
})
}
fn ohlcv_to_features(&self, data: &[OHLCV]) -> Result<Vec<Vec<f64>>, FormicaXError> {
let mut features = Vec::with_capacity(data.len());
for ohlcv in data {
let volume_normalized = ohlcv.volume as f64 / 1000.0;
features.push(vec![
ohlcv.open,
ohlcv.high,
ohlcv.low,
ohlcv.close,
volume_normalized,
]);
}
Ok(features)
}
fn initialize_centroids(&self, features: &[Vec<f64>]) -> Result<Vec<Vec<f64>>, FormicaXError> {
let mut centroids = Vec::with_capacity(self.config.k);
use rand::seq::SliceRandom;
use rand::thread_rng;
let mut rng = thread_rng();
let mut indices: Vec<usize> = (0..features.len()).collect();
indices.shuffle(&mut rng);
for i in 0..self.config.k {
let idx = indices[i % indices.len()];
centroids.push(features[idx].clone());
}
Ok(centroids)
}
fn assign_to_centroids(
&self,
features: &[Vec<f64>],
centroids: &[Vec<f64>],
) -> Result<Vec<usize>, FormicaXError> {
let mut assignments = Vec::with_capacity(features.len());
for feature in features {
let mut min_distance = f64::INFINITY;
let mut best_cluster = 0;
for (cluster_id, centroid) in centroids.iter().enumerate() {
let distance = self.euclidean_distance(feature, centroid);
if distance < min_distance {
min_distance = distance;
best_cluster = cluster_id;
}
}
assignments.push(best_cluster);
}
Ok(assignments)
}
fn update_centroids(
&self,
features: &[Vec<f64>],
assignments: &[usize],
) -> Result<Vec<Vec<f64>>, FormicaXError> {
let n_features = features[0].len();
let mut new_centroids = vec![vec![0.0; n_features]; self.config.k];
let mut cluster_sizes = vec![0; self.config.k];
for (feature, &assignment) in features.iter().zip(assignments) {
cluster_sizes[assignment] += 1;
for (j, &value) in feature.iter().enumerate() {
new_centroids[assignment][j] += value;
}
}
for (centroid, &size) in new_centroids.iter_mut().zip(&cluster_sizes) {
if size > 0 {
for value in centroid.iter_mut() {
*value /= size as f64;
}
}
}
Ok(new_centroids)
}
fn euclidean_distance(&self, a: &[f64], b: &[f64]) -> f64 {
a.iter()
.zip(b.iter())
.map(|(x, y)| (x - y).powi(2))
.sum::<f64>()
.sqrt()
}
fn calculate_silhouette_score(
&self,
_features: &[Vec<f64>],
_assignments: &[usize],
) -> Result<f64, FormicaXError> {
Ok(0.5) }
fn calculate_inertia(
&self,
features: &[Vec<f64>],
centroids: &[Vec<f64>],
assignments: &[usize],
) -> f64 {
let mut inertia = 0.0;
for (feature, &assignment) in features.iter().zip(assignments) {
let distance = self.euclidean_distance(feature, ¢roids[assignment]);
inertia += distance.powi(2);
}
inertia
}
pub fn get_centroids(&self) -> Option<&Vec<Vec<f64>>> {
self.centroids.as_ref()
}
pub fn get_assignments(&self) -> Option<&Vec<usize>> {
self.assignments.as_ref()
}
pub fn is_converged(&self) -> bool {
self.converged
}
pub fn get_iterations(&self) -> usize {
self.iterations
}
}