use ahash::AHashMap;
use ndarray::{ArrayBase, Data, Ix1};
/// Mean squared error (MSE) between two equal-length 1-D arrays.
///
/// Returns `0.0` for empty inputs.
///
/// # Panics
/// Panics if the arrays differ in length.
pub fn mean_squared_error<S>(y_true: &ArrayBase<S, Ix1>, y_pred: &ArrayBase<S, Ix1>) -> f64
where
    S: Data<Elem = f64>,
{
    if y_true.len() != y_pred.len() {
        panic!(
            "Input arrays must have the same length. Predicted: {}, Actual: {}",
            y_true.len(),
            y_pred.len()
        );
    }
    if y_true.is_empty() {
        return 0.0;
    }
    let total: f64 = y_true
        .iter()
        .zip(y_pred.iter())
        .map(|(&actual, &pred)| {
            let diff = actual - pred;
            diff * diff
        })
        .sum();
    total / y_true.len() as f64
}
/// Root mean squared error (RMSE) between predictions and targets.
///
/// Returns `0.0` when either input is empty.
///
/// # Panics
/// Panics if the (non-empty) arrays differ in length.
pub fn root_mean_squared_error<S>(
    predictions: &ArrayBase<S, Ix1>,
    targets: &ArrayBase<S, Ix1>,
) -> f64
where
    S: Data<Elem = f64>,
{
    // Preserve historical behavior: an empty input short-circuits to 0.0
    // before the length check is applied.
    if predictions.is_empty() || targets.is_empty() {
        return 0.0;
    }
    if predictions.len() != targets.len() {
        panic!(
            "Prediction and target arrays must have the same length. Predicted: {}, Actual: {}",
            predictions.len(),
            targets.len()
        );
    }
    let sum_squared_errors: f64 = predictions
        .iter()
        .zip(targets.iter())
        .map(|(&pred, &target)| {
            let error = pred - target;
            error * error
        })
        .sum();
    // A mean of squares is non-negative by construction, so `sqrt` is always
    // well-defined here (the previous `mse < 0.0` guard was unreachable).
    (sum_squared_errors / predictions.len() as f64).sqrt()
}
/// Mean absolute error (MAE) between predictions and targets.
///
/// Returns `0.0` when either input is empty.
///
/// # Panics
/// Panics if the (non-empty) arrays differ in length.
pub fn mean_absolute_error<S>(predictions: &ArrayBase<S, Ix1>, targets: &ArrayBase<S, Ix1>) -> f64
where
    S: Data<Elem = f64>,
{
    if predictions.is_empty() || targets.is_empty() {
        return 0.0;
    }
    if predictions.len() != targets.len() {
        panic!(
            "Prediction and target arrays must have the same length. Predicted: {}, Actual: {}",
            predictions.len(),
            targets.len()
        );
    }
    let sum_absolute_errors: f64 = predictions
        .iter()
        .zip(targets.iter())
        .map(|(&pred, &target)| (pred - target).abs())
        .sum();
    // Return the expression directly instead of `let mae = …; mae`
    // (clippy::let_and_return).
    sum_absolute_errors / predictions.len() as f64
}
/// Coefficient of determination (R²): `1 - SSE / SST`.
///
/// Returns `0.0` when either input is empty, or when the total sum of
/// squares is (near-)zero, i.e. the target is essentially constant.
///
/// # Panics
/// Panics if the (non-empty) arrays differ in length.
pub fn r2_score<S>(predicted: &ArrayBase<S, Ix1>, actual: &ArrayBase<S, Ix1>) -> f64
where
    S: Data<Elem = f64>,
{
    if predicted.is_empty() || actual.is_empty() {
        return 0.0;
    }
    if predicted.len() != actual.len() {
        panic!(
            "Prediction and target arrays must have the same length. Predicted: {}, Actual: {}",
            predicted.len(),
            actual.len()
        );
    }
    // Safe: emptiness has already been ruled out above.
    let actual_mean = actual.mean().unwrap();
    // Residual sum of squares.
    let sse: f64 = actual
        .iter()
        .zip(predicted.iter())
        .map(|(&act, &pred)| {
            let error = pred - act;
            error * error
        })
        .sum();
    // Total sum of squares around the mean of the targets.
    let sst: f64 = actual
        .iter()
        .map(|&act| {
            let deviation = act - actual_mean;
            deviation * deviation
        })
        .sum();
    // A (near-)constant target makes R² undefined; report 0.0 instead.
    if sst < 1e-10 {
        return 0.0;
    }
    1.0 - (sse / sst)
}
/// Counts of a binary confusion matrix, built by [`ConfusionMatrix::new`]
/// after thresholding raw scores at 0.5.
pub struct ConfusionMatrix {
    /// True positives: predicted positive, actually positive.
    tp: usize,
    /// False positives: predicted positive, actually negative.
    fp: usize,
    /// True negatives: predicted negative, actually negative.
    tn: usize,
    /// False negatives: predicted negative, actually positive.
    /// (`fn` is a keyword, hence the trailing underscore.)
    fn_: usize,
}
impl ConfusionMatrix {
    /// Builds a confusion matrix from raw scores: every value (predicted
    /// and actual) is binarized with the rule `x >= 0.5` => positive.
    ///
    /// # Panics
    /// Panics if the arrays differ in length or are empty.
    pub fn new<S>(predicted: &ArrayBase<S, Ix1>, actual: &ArrayBase<S, Ix1>) -> Self
    where
        S: Data<Elem = f64>,
    {
        if predicted.len() != actual.len() {
            panic!(
                "Input arrays must have the same length. Predicted: {}, Actual: {}",
                predicted.len(),
                actual.len()
            );
        }
        if predicted.is_empty() {
            panic!("Input arrays must not be empty");
        }
        let mut tp = 0;
        let mut fp = 0;
        let mut tn = 0;
        let mut fn_ = 0;
        for (p, a) in predicted.iter().zip(actual.iter()) {
            // Matching on bools is exhaustive, so no `unreachable!()`
            // fallback arm is needed (the previous version matched on f64
            // values, which required one).
            match (*p >= 0.5, *a >= 0.5) {
                (true, true) => tp += 1,
                (true, false) => fp += 1,
                (false, true) => fn_ += 1,
                (false, false) => tn += 1,
            }
        }
        Self { tp, fp, tn, fn_ }
    }
    /// Returns the raw counts as `(tp, fp, tn, fn)`.
    pub fn get_counts(&self) -> (usize, usize, usize, usize) {
        (self.tp, self.fp, self.tn, self.fn_)
    }
    /// Fraction of all samples classified correctly; `0.0` if there are no samples.
    pub fn accuracy(&self) -> f64 {
        let total = self.tp + self.tn + self.fp + self.fn_;
        if total == 0 {
            return 0.0;
        }
        (self.tp + self.tn) as f64 / total as f64
    }
    /// Complement of accuracy.
    pub fn error_rate(&self) -> f64 {
        1.0 - self.accuracy()
    }
    /// TP / (TP + FP); defined as `0.0` when nothing was predicted positive.
    pub fn precision(&self) -> f64 {
        if self.tp + self.fp == 0 {
            return 0.0;
        }
        self.tp as f64 / (self.tp + self.fp) as f64
    }
    /// TP / (TP + FN); defined as `1.0` when there are no actual positives
    /// (vacuously, every positive was found).
    pub fn recall(&self) -> f64 {
        if self.tp + self.fn_ == 0 {
            return 1.0;
        }
        self.tp as f64 / (self.tp + self.fn_) as f64
    }
    /// TN / (TN + FP); defined as `1.0` when there are no actual negatives.
    pub fn specificity(&self) -> f64 {
        if self.tn + self.fp == 0 {
            return 1.0;
        }
        self.tn as f64 / (self.tn + self.fp) as f64
    }
    /// Harmonic mean of precision and recall; `0.0` when both are zero.
    pub fn f1_score(&self) -> f64 {
        let precision = self.precision();
        let recall = self.recall();
        if precision + recall == 0.0 {
            return 0.0;
        }
        2.0 * (precision * recall) / (precision + recall)
    }
    /// Human-readable table of the counts plus all derived metrics.
    pub fn summary(&self) -> String {
        format!(
            "Confusion Matrix:\n\
             | | Predicted Positive | Predicted Negative |\n\
             |----------------|-------------------|--------------------|\n\
             | Actual Positive | TP: {} | FN: {} |\n\
             | Actual Negative | FP: {} | TN: {} |\n\
             \n\
             Performance Metrics:\n\
             - Accuracy: {:.4}\n\
             - Error Rate: {:.4}\n\
             - Precision: {:.4}\n\
             - Recall: {:.4}\n\
             - Specificity: {:.4}\n\
             - F1 Score: {:.4}",
            self.tp,
            self.fn_,
            self.fp,
            self.tn,
            self.accuracy(),
            self.error_rate(),
            self.precision(),
            self.recall(),
            self.specificity(),
            self.f1_score()
        )
    }
}
/// Fraction of positions where `predicted` equals `actual` (to within
/// `f64::EPSILON`).
///
/// # Panics
/// Panics if the arrays differ in length or are empty.
pub fn accuracy<S>(predicted: &ArrayBase<S, Ix1>, actual: &ArrayBase<S, Ix1>) -> f64
where
    S: Data<Elem = f64>,
{
    if predicted.len() != actual.len() {
        panic!(
            "Input arrays must have the same length. Predicted: {}, Actual: {}",
            predicted.len(),
            actual.len()
        );
    }
    if predicted.is_empty() || actual.is_empty() {
        panic!("Input arrays must not be empty");
    }
    let mut correct = 0usize;
    for (p, a) in predicted.iter().zip(actual.iter()) {
        // Treat values within machine epsilon as equal.
        if (p - a).abs() < f64::EPSILON {
            correct += 1;
        }
    }
    correct as f64 / predicted.len() as f64
}
/// Builds the contingency table of two labelings of the same samples.
///
/// Returns `(matrix, row_sums, col_sums)`, where `matrix[i][j]` counts the
/// samples whose true label maps to row `i` and predicted label to column
/// `j`. Rows/columns are indexed in first-seen order of each distinct label.
fn contingency_matrix(
    labels_true: &[usize],
    labels_pred: &[usize],
) -> (Vec<Vec<usize>>, Vec<usize>, Vec<usize>) {
    // Map each distinct label to a dense index in first-seen order.
    let mut label_to_index_true = AHashMap::new();
    for &label in labels_true {
        let next = label_to_index_true.len();
        label_to_index_true.entry(label).or_insert(next);
    }
    let mut label_to_index_pred = AHashMap::new();
    for &label in labels_pred {
        let next = label_to_index_pred.len();
        label_to_index_pred.entry(label).or_insert(next);
    }
    let n_rows = label_to_index_true.len();
    let n_cols = label_to_index_pred.len();
    let mut matrix = vec![vec![0usize; n_cols]; n_rows];
    for (l_true, l_pred) in labels_true.iter().zip(labels_pred.iter()) {
        matrix[label_to_index_true[l_true]][label_to_index_pred[l_pred]] += 1;
    }
    let row_sums: Vec<usize> = matrix.iter().map(|row| row.iter().sum()).collect();
    let col_sums: Vec<usize> = (0..n_cols)
        .map(|j| matrix.iter().map(|row| row[j]).sum())
        .collect();
    (matrix, row_sums, col_sums)
}
/// Mutual information (in nats) of the joint distribution described by the
/// contingency table.
///
/// `n` is the total sample count; `row_sums`/`col_sums` are the marginals
/// of `contingency`. Zero cells contribute nothing (0·ln 0 := 0).
/// Parameters take slices rather than `&Vec` (clippy::ptr_arg); existing
/// `&Vec` call sites still work via deref coercion.
fn mutual_information(
    contingency: &[Vec<usize>],
    n: usize,
    row_sums: &[usize],
    col_sums: &[usize],
) -> f64 {
    let mut mi = 0.0;
    for (i, row) in contingency.iter().enumerate() {
        for (j, &n_ij) in row.iter().enumerate() {
            if n_ij > 0 {
                let n_ij_f = n_ij as f64;
                let a = row_sums[i] as f64;
                let b = col_sums[j] as f64;
                // p(i,j) * ln( p(i,j) / (p(i) p(j)) ), rearranged to avoid
                // an intermediate division.
                mi += (n_ij_f / n as f64) * ((n as f64 * n_ij_f) / (a * b)).ln();
            }
        }
    }
    mi
}
/// Shannon entropy (in nats) of the discrete distribution given by `counts`
/// over `n` total samples. Zero counts contribute nothing (0·ln 0 := 0).
/// Takes a slice rather than `&Vec` (clippy::ptr_arg); existing `&Vec`
/// call sites still work via deref coercion.
fn entropy_nats(counts: &[usize], n: usize) -> f64 {
    let mut h = 0.0;
    for &count in counts {
        if count > 0 {
            let p = count as f64 / n as f64;
            h -= p * p.ln();
        }
    }
    h
}
/// Natural log of the binomial coefficient C(n, k).
///
/// Returns `-inf` for `k > n` (the coefficient is 0) and `0.0` for the
/// trivial cases `k == 0` or `k == n`. Works in log space so large `n`
/// does not overflow.
fn log_binomial_coefficient(n: u64, k: u64) -> f64 {
    if k > n {
        return f64::NEG_INFINITY;
    }
    if k == 0 || k == n {
        return 0.0;
    }
    // C(n, k) is symmetric; iterate over the smaller of k and n - k.
    let k = k.min(n - k);
    (0..k)
        .map(|i| ((n - i) as f64).ln() - ((i + 1) as f64).ln())
        .sum()
}
/// P(X = k) for a hypergeometric distribution: drawing `n_draws` items
/// without replacement from a population of `n_population` containing
/// `n_successes` successes. Computed in log space for numerical stability.
///
/// Returns `0.0` for any infeasible configuration.
fn hypergeometric_pmf(n_population: u64, n_successes: u64, n_draws: u64, k: u64) -> f64 {
    if n_successes > n_population || n_draws > n_population {
        return 0.0;
    }
    let n_failures = n_population - n_successes;
    // k must fit within both the successes and the draws, and the remaining
    // draws must fit within the failures. Short-circuit evaluation keeps
    // `n_draws - k` from underflowing when k > n_draws.
    if k > n_successes || k > n_draws || n_draws - k > n_failures {
        return 0.0;
    }
    let log_prob = log_binomial_coefficient(n_successes, k)
        + log_binomial_coefficient(n_failures, n_draws - k)
        - log_binomial_coefficient(n_population, n_draws);
    log_prob.exp()
}
/// Expected mutual information (in nats) of two random clusterings with the
/// given marginal cluster sizes, under the hypergeometric (permutation)
/// model — the EMI term of the adjusted mutual information.
/// Parameters take slices rather than `&Vec` (clippy::ptr_arg); existing
/// `&Vec` call sites still work via deref coercion.
fn expected_mutual_information(row_sums: &[usize], col_sums: &[usize], n: usize) -> f64 {
    let mut emi = 0.0;
    for &a_i in row_sums {
        for &b_j in col_sums {
            if a_i > n || b_j > n {
                continue;
            }
            // Feasible overlap k of two clusters of sizes a_i and b_j lies in
            // [max(0, a_i + b_j - n), min(a_i, b_j)]; k = 0 contributes
            // nothing, so start at 1.
            let lower_bound = if a_i + b_j > n { a_i + b_j - n } else { 0 };
            let lower = lower_bound.max(1);
            let upper = a_i.min(b_j);
            for k in lower..=upper {
                let p = hypergeometric_pmf(n as u64, a_i as u64, b_j as u64, k as u64);
                let term = (k as f64 / n as f64)
                    * ((n as f64 * k as f64) / (a_i as f64 * b_j as f64)).ln();
                emi += p * term;
            }
        }
    }
    emi
}
/// Normalized mutual information of two clusterings, using the geometric
/// mean of the marginal entropies as normalizer: `MI / sqrt(H_true * H_pred)`.
///
/// Returns `0.0` when either clustering has zero entropy (a single cluster),
/// where NMI is undefined.
///
/// # Panics
/// Panics if the arrays differ in length or are empty.
pub fn normalized_mutual_info<S>(
    labels_true: &ArrayBase<S, Ix1>,
    labels_pred: &ArrayBase<S, Ix1>,
) -> f64
where
    S: Data<Elem = usize>,
{
    if labels_true.len() != labels_pred.len() {
        panic!(
            "Input arrays must have the same length. Predicted: {}, Actual: {}",
            labels_true.len(),
            labels_pred.len()
        );
    }
    if labels_true.is_empty() || labels_pred.is_empty() {
        panic!("Input arrays cannot be empty");
    }
    let n = labels_true.len();
    // Copy into Vecs instead of `as_slice().unwrap()`: a non-contiguous
    // 1-D view (e.g. a stepped slice) has no backing slice and the unwrap
    // would panic.
    let labels_true_vec: Vec<usize> = labels_true.iter().copied().collect();
    let labels_pred_vec: Vec<usize> = labels_pred.iter().copied().collect();
    let (contingency, row_sums, col_sums) =
        contingency_matrix(&labels_true_vec, &labels_pred_vec);
    let mi = mutual_information(&contingency, n, &row_sums, &col_sums);
    let h_true = entropy_nats(&row_sums, n);
    let h_pred = entropy_nats(&col_sums, n);
    if h_true * h_pred == 0.0 {
        0.0
    } else {
        mi / (h_true * h_pred).sqrt()
    }
}
/// Adjusted mutual information of two clusterings:
/// `(MI - EMI) / (mean(H_true, H_pred) - EMI)`, which corrects MI for
/// chance agreement. Returns `1.0` when the denominator is (near-)zero.
///
/// # Panics
/// Panics if the arrays differ in length or are empty.
pub fn adjusted_mutual_info<S>(
    labels_true: &ArrayBase<S, Ix1>,
    labels_pred: &ArrayBase<S, Ix1>,
) -> f64
where
    S: Data<Elem = usize>,
{
    if labels_true.len() != labels_pred.len() {
        panic!(
            "Input arrays must have the same length. Predicted: {}, Actual: {}",
            labels_true.len(),
            labels_pred.len()
        );
    }
    if labels_true.is_empty() || labels_pred.is_empty() {
        panic!("Input arrays cannot be empty");
    }
    let n = labels_true.len();
    // Copy into Vecs instead of `as_slice().unwrap()`: a non-contiguous
    // 1-D view (e.g. a stepped slice) has no backing slice and the unwrap
    // would panic.
    let labels_true_vec: Vec<usize> = labels_true.iter().copied().collect();
    let labels_pred_vec: Vec<usize> = labels_pred.iter().copied().collect();
    let (contingency, row_sums, col_sums) =
        contingency_matrix(&labels_true_vec, &labels_pred_vec);
    let mi = mutual_information(&contingency, n, &row_sums, &col_sums);
    let h_true = entropy_nats(&row_sums, n);
    let h_pred = entropy_nats(&col_sums, n);
    let emi = expected_mutual_information(&row_sums, &col_sums, n);
    let denominator = ((h_true + h_pred) / 2.0) - emi;
    // Degenerate case (e.g. both clusterings trivial): define AMI as 1.0.
    if denominator.abs() < 1e-10 {
        1.0
    } else {
        (mi - emi) / denominator
    }
}
/// Area under the ROC curve via the rank-sum (Mann–Whitney U) method, with
/// average ranks assigned to tied scores.
///
/// # Panics
/// Panics if the arrays differ in length, are empty, or the labels contain
/// only one class (AUC is undefined).
pub fn calculate_auc<S1, S2>(scores: &ArrayBase<S1, Ix1>, labels: &ArrayBase<S2, Ix1>) -> f64
where
    S1: Data<Elem = f64>,
    S2: Data<Elem = bool>,
{
    if scores.len() != labels.len() {
        panic!(
            "Input arrays must have the same length. Predicted: {}, Actual: {}",
            scores.len(),
            labels.len()
        );
    }
    if scores.is_empty() || labels.is_empty() {
        panic!("Input arrays cannot be empty");
    }
    let mut pairs: Vec<(f64, bool)> = scores
        .iter()
        .zip(labels.iter())
        .map(|(s, &l)| (*s, l))
        .collect();
    // `total_cmp` is a total order on f64, so sorting cannot panic on NaN
    // (the previous `partial_cmp(..).unwrap()` would). `sort_unstable` is
    // fine: tie groups receive the same average rank regardless of order.
    pairs.sort_unstable_by(|a, b| a.0.total_cmp(&b.0));
    let mut pos_count = 0;
    let mut neg_count = 0;
    let mut sum_positive_ranks = 0.0;
    let n = pairs.len();
    let mut i = 0;
    let mut rank = 1.0;
    while i < n {
        // Extend j past all scores tied (within 1e-12) with pairs[i].
        let current_score = pairs[i].0;
        let mut j = i;
        while j < n && (pairs[j].0 - current_score).abs() < 1e-12 {
            j += 1;
        }
        let count = (j - i) as f64;
        // Average of the ranks rank..rank+count-1, shared by the tie group.
        let avg_rank = (2.0 * rank + count - 1.0) / 2.0;
        for k in i..j {
            if pairs[k].1 {
                sum_positive_ranks += avg_rank;
                pos_count += 1;
            } else {
                neg_count += 1;
            }
        }
        rank += count;
        i = j;
    }
    if pos_count == 0 || neg_count == 0 {
        panic!("AUC cannot be calculated because there are no positive or negative samples");
    }
    // Mann-Whitney U statistic, normalized by the number of (pos, neg) pairs.
    let u = sum_positive_ranks - (pos_count as f64 * (pos_count as f64 + 1.0) / 2.0);
    u / (pos_count as f64 * neg_count as f64)
}