use crate::cluster::traits::metrics::HCVScore;
use crate::cluster::validation::{validate_cluster_dtype, validate_data_2d, validate_labels};
use numr::dtype::DType;
use numr::error::Result;
use numr::ops::{
CompareOps, ConditionalOps, DistanceMetric, DistanceOps, IndexingOps, ReduceOps, ScalarOps,
ShapeOps, TensorOps, TypeConversionOps, UnaryOps, UtilityOps,
};
use numr::runtime::{Runtime, RuntimeClient};
use numr::tensor::Tensor;
pub fn silhouette_samples_impl<R, C>(
client: &C,
data: &Tensor<R>,
labels: &Tensor<R>,
metric: DistanceMetric,
) -> Result<Tensor<R>>
where
R: Runtime<DType = DType>,
C: DistanceOps<R>
+ ReduceOps<R>
+ ScalarOps<R>
+ TensorOps<R>
+ CompareOps<R>
+ ConditionalOps<R>
+ ShapeOps<R>
+ UnaryOps<R>
+ UtilityOps<R>
+ IndexingOps<R>
+ TypeConversionOps<R>
+ RuntimeClient<R>,
{
validate_cluster_dtype(data.dtype(), "silhouette_samples")?;
validate_data_2d(data.shape(), "silhouette_samples")?;
validate_labels(labels.shape(), labels.dtype(), "silhouette_samples")?;
let n = data.shape()[0];
let dtype = data.dtype();
let device = data.device();
let dists = client.cdist(data, data, metric)?;
let labels_f = client.cast(labels, dtype)?;
let labels_row = labels_f.unsqueeze(1)?; let labels_col = labels_f.unsqueeze(0)?; let same_f = client.eq(&labels_row, &labels_col)?;
let dist_same = client.mul(&dists, &same_f)?; let sum_same = client.sum(&dist_same, &[1], false)?; let count_same = client.sum(&same_f, &[1], false)?; let count_same_excl = client.sub_scalar(&count_same, 1.0)?;
let count_safe = client.maximum(&count_same_excl, &Tensor::<R>::ones(&[n], dtype, device))?;
let a_i = client.div(&sum_same, &count_safe)?;
let max_label: f64 = client.max(&labels_f, &[0], false)?.item()?;
let n_clusters = (max_label as usize) + 1;
let inf_tensor = Tensor::<R>::full_scalar(&[n], dtype, f64::INFINITY, device);
let mut b_i = inf_tensor;
for k in 0..n_clusters {
let k_tensor = Tensor::<R>::full_scalar(&[1], dtype, k as f64, device);
let mask_k = client.eq(&labels_col, &k_tensor)?; let mask_k_f = client.cast(&mask_k, dtype)?;
let dist_to_k = client.mul(&dists, &mask_k_f)?; let sum_to_k = client.sum(&dist_to_k, &[1], false)?; let count_k = client.sum(&mask_k_f, &[1], false)?; let count_k_safe = client.maximum(&count_k, &Tensor::<R>::ones(&[n], dtype, device))?;
let mean_to_k = client.div(&sum_to_k, &count_k_safe)?;
let in_k = client.eq(
&labels_f,
&Tensor::<R>::full_scalar(&[1], dtype, k as f64, device),
)?;
let inf_n = Tensor::<R>::full_scalar(&[n], dtype, f64::INFINITY, device);
let mean_to_k = client.where_cond(&in_k, &inf_n, &mean_to_k)?;
b_i = client.minimum(&b_i, &mean_to_k)?;
}
let num = client.sub(&b_i, &a_i)?;
let den = client.maximum(&a_i, &b_i)?;
let den_safe = client.maximum(&den, &Tensor::<R>::full_scalar(&[n], dtype, 1e-10, device))?;
client.div(&num, &den_safe)
}
pub fn silhouette_score_impl<R, C>(
client: &C,
data: &Tensor<R>,
labels: &Tensor<R>,
metric: DistanceMetric,
) -> Result<Tensor<R>>
where
R: Runtime<DType = DType>,
C: DistanceOps<R>
+ ReduceOps<R>
+ ScalarOps<R>
+ TensorOps<R>
+ CompareOps<R>
+ ConditionalOps<R>
+ ShapeOps<R>
+ UnaryOps<R>
+ UtilityOps<R>
+ IndexingOps<R>
+ TypeConversionOps<R>
+ RuntimeClient<R>,
{
let samples = silhouette_samples_impl(client, data, labels, metric)?;
client.mean(&samples, &[0], false)
}
pub fn calinski_harabasz_score_impl<R, C>(
client: &C,
data: &Tensor<R>,
labels: &Tensor<R>,
) -> Result<Tensor<R>>
where
R: Runtime<DType = DType>,
C: DistanceOps<R>
+ ReduceOps<R>
+ ScalarOps<R>
+ TensorOps<R>
+ CompareOps<R>
+ ConditionalOps<R>
+ ShapeOps<R>
+ UnaryOps<R>
+ UtilityOps<R>
+ IndexingOps<R>
+ TypeConversionOps<R>
+ RuntimeClient<R>,
{
validate_cluster_dtype(data.dtype(), "calinski_harabasz_score")?;
validate_data_2d(data.shape(), "calinski_harabasz_score")?;
validate_labels(labels.shape(), labels.dtype(), "calinski_harabasz_score")?;
let n = data.shape()[0];
let d = data.shape()[1];
let dtype = data.dtype();
let device = data.device();
let labels_f = client.cast(labels, dtype)?;
let max_label: f64 = client.max(&labels_f, &[0], false)?.item()?;
let k = (max_label as usize) + 1;
if k <= 1 || n <= k {
return Ok(Tensor::<R>::full_scalar(&[], dtype, 0.0, device));
}
let global_centroid = client.mean(data, &[0], true)?;
let labels_exp = labels.unsqueeze(1)?.broadcast_to(&[n, d])?;
let dst = Tensor::<R>::zeros(&[k, d], dtype, device);
let cluster_sums = client.scatter_reduce(
&dst,
0,
&labels_exp,
data,
numr::ops::ScatterReduceOp::Sum,
false,
)?;
let counts = client.bincount(labels, None, k)?;
let counts_f = client.cast(&counts, dtype)?;
let counts_safe = client.maximum(&counts_f, &Tensor::<R>::ones(&[k], dtype, device))?;
let counts_exp = counts_safe.unsqueeze(1)?.broadcast_to(&[k, d])?;
let centroids = client.div(&cluster_sums, &counts_exp)?;
let diff = client.sub(¢roids, &global_centroid)?; let sq_diff = client.mul(&diff, &diff)?;
let sq_dist_per_cluster = client.sum(&sq_diff, &[1], false)?; let weighted = client.mul(&sq_dist_per_cluster, &counts_f)?;
let bgd = client.sum(&weighted, &[0], false)?;
let point_centroids = client.index_select(¢roids, 0, labels)?; let point_diff = client.sub(data, &point_centroids)?;
let point_sq = client.mul(&point_diff, &point_diff)?;
let wgd = client.sum(&point_sq, &[0, 1], false)?;
let bgd_norm = client.div_scalar(&bgd, (k - 1) as f64)?;
let wgd_norm = client.div_scalar(&wgd, (n - k) as f64)?;
client.div(&bgd_norm, &wgd_norm)
}
pub fn davies_bouldin_score_impl<R, C>(
client: &C,
data: &Tensor<R>,
labels: &Tensor<R>,
) -> Result<Tensor<R>>
where
R: Runtime<DType = DType>,
C: DistanceOps<R>
+ ReduceOps<R>
+ ScalarOps<R>
+ TensorOps<R>
+ CompareOps<R>
+ ConditionalOps<R>
+ ShapeOps<R>
+ UnaryOps<R>
+ UtilityOps<R>
+ IndexingOps<R>
+ TypeConversionOps<R>
+ RuntimeClient<R>,
{
validate_cluster_dtype(data.dtype(), "davies_bouldin_score")?;
validate_data_2d(data.shape(), "davies_bouldin_score")?;
validate_labels(labels.shape(), labels.dtype(), "davies_bouldin_score")?;
let n = data.shape()[0];
let d = data.shape()[1];
let dtype = data.dtype();
let device = data.device();
let labels_f = client.cast(labels, dtype)?;
let max_label: f64 = client.max(&labels_f, &[0], false)?.item()?;
let k = (max_label as usize) + 1;
if k <= 1 {
return Ok(Tensor::<R>::full_scalar(&[], dtype, 0.0, device));
}
let labels_exp = labels.unsqueeze(1)?.broadcast_to(&[n, d])?;
let dst = Tensor::<R>::zeros(&[k, d], dtype, device);
let cluster_sums = client.scatter_reduce(
&dst,
0,
&labels_exp,
data,
numr::ops::ScatterReduceOp::Sum,
false,
)?;
let counts = client.bincount(labels, None, k)?;
let counts_f = client.cast(&counts, dtype)?;
let counts_safe = client.maximum(&counts_f, &Tensor::<R>::ones(&[k], dtype, device))?;
let counts_exp = counts_safe.unsqueeze(1)?.broadcast_to(&[k, d])?;
let centroids = client.div(&cluster_sums, &counts_exp)?;
let point_centroids = client.index_select(¢roids, 0, labels)?; let point_diff = client.sub(data, &point_centroids)?;
let point_dist =
client.sqrt(&client.sum(&client.mul(&point_diff, &point_diff)?, &[1], false)?)?;
let dst_s = Tensor::<R>::zeros(&[k], dtype, device);
let labels_1d = labels.reshape(&[n])?;
let s_sums = client.scatter_reduce(
&dst_s,
0,
&labels_1d,
&point_dist,
numr::ops::ScatterReduceOp::Sum,
false,
)?;
let s_i = client.div(&s_sums, &counts_safe)?;
let inter_dists = client.cdist(¢roids, ¢roids, DistanceMetric::Euclidean)?;
let s_row = s_i.unsqueeze(1)?.broadcast_to(&[k, k])?; let s_col = s_i.unsqueeze(0)?.broadcast_to(&[k, k])?; let s_sum = client.add(&s_row, &s_col)?;
let inter_safe = client.add(
&inter_dists,
&Tensor::<R>::full_scalar(&[1], dtype, 1e-10, device),
)?;
let r_matrix = client.div(&s_sum, &inter_safe)?;
let idx = client.arange(0.0, k as f64, 1.0, dtype)?;
let idx_row = idx.unsqueeze(1)?;
let idx_col = idx.unsqueeze(0)?;
let diag = client.eq(&idx_row, &idx_col)?;
let diag_u8 = client.cast(&diag, DType::U8)?;
let r_matrix = client.masked_fill(&r_matrix, &diag_u8, 0.0)?;
let max_r = client.max(&r_matrix, &[1], false)?; client.mean(&max_r, &[0], false)
}
fn contingency_matrix<R, C>(
client: &C,
labels_true: &Tensor<R>,
labels_pred: &Tensor<R>,
) -> Result<(Tensor<R>, usize, usize)>
where
R: Runtime<DType = DType>,
C: ReduceOps<R>
+ ScalarOps<R>
+ TensorOps<R>
+ CompareOps<R>
+ ConditionalOps<R>
+ ShapeOps<R>
+ UnaryOps<R>
+ UtilityOps<R>
+ IndexingOps<R>
+ TypeConversionOps<R>
+ RuntimeClient<R>,
{
let dtype = DType::F64;
let device = labels_true.device();
let n = labels_true.shape()[0];
let true_f = client.cast(labels_true, dtype)?;
let pred_f = client.cast(labels_pred, dtype)?;
let max_true: f64 = client.max(&true_f, &[0], false)?.item()?;
let max_pred: f64 = client.max(&pred_f, &[0], false)?.item()?;
let n_true = (max_true as usize) + 1;
let n_pred = (max_pred as usize) + 1;
let true_scaled = client.mul_scalar(&true_f, n_pred as f64)?;
let flat_idx = client.add(&true_scaled, &pred_f)?;
let flat_idx_i64 = client.cast(&flat_idx, DType::I64)?;
let ones = Tensor::<R>::ones(&[n], dtype, device);
let contingency_flat = client.bincount(&flat_idx_i64, Some(&ones), n_true * n_pred)?;
let contingency = contingency_flat.reshape(&[n_true, n_pred])?;
Ok((contingency, n_true, n_pred))
}
pub fn adjusted_rand_score_impl<R, C>(
client: &C,
labels_true: &Tensor<R>,
labels_pred: &Tensor<R>,
) -> Result<Tensor<R>>
where
R: Runtime<DType = DType>,
C: ReduceOps<R>
+ ScalarOps<R>
+ TensorOps<R>
+ CompareOps<R>
+ ConditionalOps<R>
+ ShapeOps<R>
+ UnaryOps<R>
+ UtilityOps<R>
+ IndexingOps<R>
+ TypeConversionOps<R>
+ RuntimeClient<R>,
{
let dtype = DType::F64;
let device = labels_true.device();
let (contingency, _n_true, _n_pred) = contingency_matrix(client, labels_true, labels_pred)?;
let c_minus_1 = client.sub_scalar(&contingency, 1.0)?;
let comb_all = client.mul(&contingency, &c_minus_1)?;
let comb_all = client.div_scalar(&comb_all, 2.0)?;
let sum_comb = client.sum(&comb_all, &[0, 1], false)?;
let a = client.sum(&contingency, &[1], false)?; let b = client.sum(&contingency, &[0], false)?;
let a_minus_1 = client.sub_scalar(&a, 1.0)?;
let comb_a = client.div_scalar(&client.mul(&a, &a_minus_1)?, 2.0)?;
let sum_comb_a = client.sum(&comb_a, &[0], false)?;
let b_minus_1 = client.sub_scalar(&b, 1.0)?;
let comb_b = client.div_scalar(&client.mul(&b, &b_minus_1)?, 2.0)?;
let sum_comb_b = client.sum(&comb_b, &[0], false)?;
let n_f: f64 = labels_true.shape()[0] as f64;
let n_comb = n_f * (n_f - 1.0) / 2.0;
let n_comb_t = Tensor::<R>::full_scalar(&[], dtype, n_comb, device);
let expected = client.div(&client.mul(&sum_comb_a, &sum_comb_b)?, &n_comb_t)?;
let max_index = client.div_scalar(&client.add(&sum_comb_a, &sum_comb_b)?, 2.0)?;
let numerator = client.sub(&sum_comb, &expected)?;
let denominator = client.sub(&max_index, &expected)?;
let denom_safe = client.maximum(
&client.abs(&denominator)?,
&Tensor::<R>::full_scalar(&[], dtype, 1e-15, device),
)?;
let sign = client.div(&denominator, &denom_safe)?;
let denom_with_sign = client.mul(&denom_safe, &sign)?;
client.div(&numerator, &denom_with_sign)
}
fn entropy_from_counts<R, C>(client: &C, counts: &Tensor<R>, n: f64) -> Result<Tensor<R>>
where
R: Runtime<DType = DType>,
C: ReduceOps<R>
+ ScalarOps<R>
+ TensorOps<R>
+ CompareOps<R>
+ ConditionalOps<R>
+ UnaryOps<R>
+ RuntimeClient<R>,
{
let dtype = counts.dtype();
let device = counts.device();
let n_t = Tensor::<R>::full_scalar(&[1], dtype, n, device);
let p = client.div(counts, &n_t)?;
let zero = Tensor::<R>::zeros(&[1], dtype, device);
let mask = client.gt(&p, &zero)?;
let safe_p = client.maximum(&p, &Tensor::<R>::full_scalar(&[1], dtype, 1e-15, device))?;
let log_p = client.log(&safe_p)?;
let p_log_p = client.mul(&p, &log_p)?;
let masked = client.where_cond(&mask, &p_log_p, &zero)?;
let neg_sum = client.sum(&masked, &[0], false)?;
client.mul_scalar(&neg_sum, -1.0)
}
pub fn normalized_mutual_info_score_impl<R, C>(
client: &C,
labels_true: &Tensor<R>,
labels_pred: &Tensor<R>,
) -> Result<Tensor<R>>
where
R: Runtime<DType = DType>,
C: ReduceOps<R>
+ ScalarOps<R>
+ TensorOps<R>
+ CompareOps<R>
+ ConditionalOps<R>
+ ShapeOps<R>
+ UnaryOps<R>
+ UtilityOps<R>
+ IndexingOps<R>
+ TypeConversionOps<R>
+ RuntimeClient<R>,
{
let dtype = DType::F64;
let device = labels_true.device();
let n = labels_true.shape()[0] as f64;
let (contingency, _n_true, _n_pred) = contingency_matrix(client, labels_true, labels_pred)?;
let a = client.sum(&contingency, &[1], false)?; let b = client.sum(&contingency, &[0], false)?;
let h_true = entropy_from_counts(client, &a, n)?;
let h_pred = entropy_from_counts(client, &b, n)?;
let n_t = Tensor::<R>::full_scalar(&[1], dtype, n, device);
let zero = Tensor::<R>::zeros(&[1], dtype, device);
let mask = client.gt(&contingency, &zero)?;
let a_col = a.unsqueeze(1)?;
let b_row = b.unsqueeze(0)?;
let outer = client.mul(&a_col, &b_row)?;
let numer = client.mul_scalar(&contingency, n)?;
let outer_safe = client.maximum(
&outer,
&Tensor::<R>::full_scalar(&[1], dtype, 1e-15, device),
)?;
let ratio = client.div(&numer, &outer_safe)?;
let safe_ratio = client.maximum(
&ratio,
&Tensor::<R>::full_scalar(&[1], dtype, 1e-15, device),
)?;
let log_ratio = client.log(&safe_ratio)?;
let p_ij = client.div(&contingency, &n_t)?;
let mi_terms = client.mul(&p_ij, &log_ratio)?;
let mi_masked = client.where_cond(&mask, &mi_terms, &zero)?;
let mi = client.sum(&mi_masked, &[0, 1], false)?;
let h_sum = client.add(&h_true, &h_pred)?;
let h_safe = client.maximum(&h_sum, &Tensor::<R>::full_scalar(&[], dtype, 1e-15, device))?;
let nmi = client.div(&client.mul_scalar(&mi, 2.0)?, &h_safe)?;
Ok(nmi)
}
pub fn homogeneity_completeness_v_measure_impl<R, C>(
client: &C,
labels_true: &Tensor<R>,
labels_pred: &Tensor<R>,
) -> Result<HCVScore<R>>
where
R: Runtime<DType = DType>,
C: ReduceOps<R>
+ ScalarOps<R>
+ TensorOps<R>
+ CompareOps<R>
+ ConditionalOps<R>
+ ShapeOps<R>
+ UnaryOps<R>
+ UtilityOps<R>
+ IndexingOps<R>
+ TypeConversionOps<R>
+ RuntimeClient<R>,
{
let dtype = DType::F64;
let device = labels_true.device();
let n = labels_true.shape()[0] as f64;
let (contingency, _n_true, _n_pred) = contingency_matrix(client, labels_true, labels_pred)?;
let a = client.sum(&contingency, &[1], false)?; let b = client.sum(&contingency, &[0], false)?;
let h_c = entropy_from_counts(client, &a, n)?; let h_k = entropy_from_counts(client, &b, n)?;
let n_t = Tensor::<R>::full_scalar(&[1], dtype, n, device);
let zero = Tensor::<R>::zeros(&[1], dtype, device);
let mask = client.gt(&contingency, &zero)?;
let b_row = b.unsqueeze(0)?;
let b_safe = client.maximum(
&b_row,
&Tensor::<R>::full_scalar(&[1], dtype, 1e-15, device),
)?;
let ratio_ck = client.div(&contingency, &b_safe)?;
let safe_ratio_ck = client.maximum(
&ratio_ck,
&Tensor::<R>::full_scalar(&[1], dtype, 1e-15, device),
)?;
let log_ck = client.log(&safe_ratio_ck)?;
let p_ij = client.div(&contingency, &n_t)?;
let terms_ck = client.mul(&p_ij, &log_ck)?;
let masked_ck = client.where_cond(&mask, &terms_ck, &zero)?;
let h_c_given_k = client.mul_scalar(&client.sum(&masked_ck, &[0, 1], false)?, -1.0)?;
let a_col = a.unsqueeze(1)?;
let a_safe = client.maximum(
&a_col,
&Tensor::<R>::full_scalar(&[1], dtype, 1e-15, device),
)?;
let ratio_kc = client.div(&contingency, &a_safe)?;
let safe_ratio_kc = client.maximum(
&ratio_kc,
&Tensor::<R>::full_scalar(&[1], dtype, 1e-15, device),
)?;
let log_kc = client.log(&safe_ratio_kc)?;
let terms_kc = client.mul(&p_ij, &log_kc)?;
let masked_kc = client.where_cond(&mask, &terms_kc, &zero)?;
let h_k_given_c = client.mul_scalar(&client.sum(&masked_kc, &[0, 1], false)?, -1.0)?;
let h_c_safe = client.maximum(&h_c, &Tensor::<R>::full_scalar(&[], dtype, 1e-15, device))?;
let homogeneity = client.sub_scalar(&client.div(&h_c_given_k, &h_c_safe)?, 1.0)?;
let homogeneity = client.mul_scalar(&homogeneity, -1.0)?;
let h_k_safe = client.maximum(&h_k, &Tensor::<R>::full_scalar(&[], dtype, 1e-15, device))?;
let completeness = client.sub_scalar(&client.div(&h_k_given_c, &h_k_safe)?, 1.0)?;
let completeness = client.mul_scalar(&completeness, -1.0)?;
let hc_sum = client.add(&homogeneity, &completeness)?;
let hc_safe = client.maximum(
&hc_sum,
&Tensor::<R>::full_scalar(&[], dtype, 1e-15, device),
)?;
let hc_prod = client.mul(&homogeneity, &completeness)?;
let v_measure = client.div(&client.mul_scalar(&hc_prod, 2.0)?, &hc_safe)?;
Ok(HCVScore {
homogeneity,
completeness,
v_measure,
})
}