use crate::error::{AnomalyError, AnomalyResult};
/// Small constant added to each looked-up density before its logarithm is taken in
/// `Hbos::score`, so that a density of zero still produces a finite score.
const EPSILON: f32 = 1e-10;
/// Configuration for the [`Hbos`] detector.
#[derive(Debug, Clone)]
pub struct HbosConfig {
/// Number of equal-width histogram bins built for each non-constant feature.
///
/// Must be at least 1: `Hbos::fit` returns an error when this is 0.
pub n_bins: usize,
}
impl Default for HbosConfig {
fn default() -> Self {
Self { n_bins: 10 }
}
}
/// Histogram-based outlier scorer.
///
/// Holds one equal-width histogram per feature, built by `fit`, and the
/// bookkeeping needed to look densities up again when scoring.
#[derive(Debug, Clone)]
pub struct Hbos {
    /// Settings supplied at construction time (bin count).
    config: HbosConfig,
    /// Per-feature minimum observed in the training data.
    min_vals: Vec<f32>,
    /// Per-feature maximum observed in the training data.
    max_vals: Vec<f32>,
    /// Per-feature bin width; `1.0` is stored as a placeholder for constant features.
    bin_widths: Vec<f32>,
    /// Per-feature bin densities; a constant feature has the single entry `1.0`.
    densities: Vec<Vec<f32>>,
    /// Number of features the model was fitted on.
    n_features: usize,
    /// Number of training samples; `0` means the model has not been fitted yet.
    n_samples: usize,
}
impl Default for Hbos {
fn default() -> Self {
Self::new(HbosConfig::default())
}
}
impl Hbos {
    /// Absolute tolerance within which a query counts as equal to the single
    /// training value of a constant feature.
    const CONSTANT_MATCH_TOLERANCE: f32 = 1e-8;

    /// Creates an unfitted detector that will use `config` when [`Hbos::fit`] is called.
    ///
    /// Scoring before fitting returns [`AnomalyError::NotFitted`].
    #[must_use]
    pub fn new(config: HbosConfig) -> Self {
        Self {
            config,
            min_vals: Vec::new(),
            max_vals: Vec::new(),
            bin_widths: Vec::new(),
            densities: Vec::new(),
            n_features: 0,
            n_samples: 0,
        }
    }

    /// Builds one equal-width histogram per feature from `data`.
    ///
    /// `data` is read row-major: sample `s`, feature `j` lives at
    /// `data[s * n_features + j]`. A feature whose training values are all equal is
    /// stored as a single bin with density `1.0`. On error the previously fitted
    /// state (if any) is left untouched.
    ///
    /// # Errors
    ///
    /// * [`AnomalyError::EmptyInput`] if `n_samples` is 0.
    /// * [`AnomalyError::InvalidFeatureCount`] if `n_features` is 0.
    /// * [`AnomalyError::DimensionMismatch`] if `data.len()` is not
    ///   `n_samples * n_features` (including when that product overflows `usize`).
    /// * [`AnomalyError::Internal`] if the configured `n_bins` is 0, or if `data`
    ///   contains NaN or an infinity.
    pub fn fit(&mut self, data: &[f32], n_samples: usize, n_features: usize) -> AnomalyResult<()> {
        if n_samples == 0 {
            return Err(AnomalyError::EmptyInput);
        }
        if n_features == 0 {
            return Err(AnomalyError::InvalidFeatureCount { n: 0 });
        }
        // Saturate instead of overflowing: a slice can never hold `usize::MAX`
        // `f32`s, so an overflowing product is always reported as a mismatch.
        let expected = n_samples.saturating_mul(n_features);
        if data.len() != expected {
            return Err(AnomalyError::DimensionMismatch {
                expected,
                got: data.len(),
            });
        }
        let n_bins = self.config.n_bins;
        if n_bins == 0 {
            return Err(AnomalyError::Internal {
                msg: "n_bins must be >= 1".into(),
            });
        }
        // NaN is invisible to the min/max comparisons yet would still be counted
        // (in bin 0), and infinities produce infinite/NaN bin widths, so reject
        // them up front instead of silently building a corrupted model.
        if data.iter().any(|v| !v.is_finite()) {
            return Err(AnomalyError::Internal {
                msg: "training data must contain only finite values".into(),
            });
        }

        // Pass 1: per-feature value range.
        let mut min_vals = vec![f32::INFINITY; n_features];
        let mut max_vals = vec![f32::NEG_INFINITY; n_features];
        for row in data.chunks_exact(n_features) {
            for ((lo, hi), &v) in min_vals.iter_mut().zip(max_vals.iter_mut()).zip(row) {
                *lo = lo.min(v);
                *hi = hi.max(v);
            }
        }

        // Pass 2: per-feature histogram, normalised to a density.
        let mut bin_widths = Vec::with_capacity(n_features);
        let mut densities = Vec::with_capacity(n_features);
        for (j, (&lo, &hi)) in min_vals.iter().zip(&max_vals).enumerate() {
            if Self::is_constant(lo, hi) {
                // Placeholder width; a constant feature is matched by value, not binned.
                bin_widths.push(1.0);
                densities.push(vec![1.0_f32]);
                continue;
            }
            let width = (hi - lo) / n_bins as f32;
            let mut counts = vec![0_u64; n_bins];
            for row in data.chunks_exact(n_features) {
                counts[Self::bin_index(row[j], lo, width, n_bins)] += 1;
            }
            let denom = n_samples as f32 * width;
            bin_widths.push(width);
            densities.push(counts.iter().map(|&c| c as f32 / denom).collect());
        }

        // Commit only after everything succeeded.
        self.min_vals = min_vals;
        self.max_vals = max_vals;
        self.bin_widths = bin_widths;
        self.densities = densities;
        self.n_features = n_features;
        self.n_samples = n_samples;
        Ok(())
    }

    /// Scores a single sample: the sum over features of `-ln(density + EPSILON)`.
    ///
    /// Higher scores mean the sample falls in sparser regions of the training
    /// histograms. Values outside a feature's training range, and NaN, contribute
    /// a density of zero.
    ///
    /// # Errors
    ///
    /// * [`AnomalyError::NotFitted`] if [`Hbos::fit`] has not succeeded yet.
    /// * [`AnomalyError::FeatureCountMismatch`] if `x.len()` differs from the
    ///   fitted feature count.
    pub fn score(&self, x: &[f32]) -> AnomalyResult<f32> {
        if self.n_samples == 0 {
            return Err(AnomalyError::NotFitted);
        }
        if x.len() != self.n_features {
            return Err(AnomalyError::FeatureCountMismatch {
                expected: self.n_features,
                got: x.len(),
            });
        }
        let total = x.iter().enumerate().fold(0.0_f32, |acc, (j, &v)| {
            acc - (self.lookup_density(j, v) + EPSILON).ln()
        });
        Ok(total)
    }

    /// Scores `n` samples stored row-major in `x`, returning one score per sample.
    ///
    /// # Errors
    ///
    /// * [`AnomalyError::NotFitted`] if [`Hbos::fit`] has not succeeded yet.
    /// * [`AnomalyError::DimensionMismatch`] if `x.len()` is not `n` times the
    ///   fitted feature count (including when that product overflows `usize`).
    pub fn score_batch(&self, x: &[f32], n: usize) -> AnomalyResult<Vec<f32>> {
        if self.n_samples == 0 {
            return Err(AnomalyError::NotFitted);
        }
        // See `fit`: saturating keeps an overflowing request a plain mismatch.
        let expected = n.saturating_mul(self.n_features);
        if x.len() != expected {
            return Err(AnomalyError::DimensionMismatch {
                expected,
                got: x.len(),
            });
        }
        // A fitted model always has `n_features >= 1`, so the chunk size is non-zero.
        x.chunks_exact(self.n_features)
            .map(|sample| self.score(sample))
            .collect()
    }

    /// Returns the fitted density of feature `j` at value `v`, or `0.0` when `v`
    /// is NaN or lies outside the range seen during training.
    #[inline]
    fn lookup_density(&self, j: usize, v: f32) -> f32 {
        let (lo, hi) = (self.min_vals[j], self.max_vals[j]);
        let feature_dens = &self.densities[j];
        // Must be the same predicate `fit` used, otherwise a feature fitted with
        // real bins could be looked up as if it were constant (or vice versa).
        if Self::is_constant(lo, hi) {
            return if (v - lo).abs() < Self::CONSTANT_MATCH_TOLERANCE {
                feature_dens[0]
            } else {
                0.0
            };
        }
        // Containment is false for NaN, so NaN queries get zero density instead of
        // being cast into bin 0.
        if !(lo..=hi).contains(&v) {
            return 0.0;
        }
        feature_dens[Self::bin_index(v, lo, self.bin_widths[j], feature_dens.len())]
    }

    /// Whether a feature with the given training range is stored as a single
    /// constant-value bin rather than a histogram.
    #[inline]
    fn is_constant(min_val: f32, max_val: f32) -> bool {
        max_val - min_val <= 0.0
    }

    /// Maps `v` to its bin, clamping to `0..n_bins` so that `v == max` lands in the
    /// last bin. `n_bins` must be at least 1.
    #[inline]
    fn bin_index(v: f32, min_val: f32, bin_width: f32, n_bins: usize) -> usize {
        // The float-to-int `as` cast saturates and maps NaN to 0, so it cannot panic.
        let raw = ((v - min_val) / bin_width).floor() as isize;
        raw.max(0).min(n_bins as isize - 1) as usize
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Twenty evenly spaced 1-D training samples: 0.0, 0.1, ..., 1.9.
    fn simple_1d() -> Vec<f32> {
        let mut samples = Vec::with_capacity(20);
        for step in 0..20 {
            samples.push(step as f32 * 0.1);
        }
        samples
    }

    /// A fitted model yields a finite score for an in-range query.
    #[test]
    fn fit_and_score_finite() {
        let mut detector = Hbos::default();
        let training = simple_1d();
        detector
            .fit(&training, 20, 1)
            .expect("fit should succeed on valid 1-D data");
        let s = detector
            .score(&[0.9_f32])
            .expect("score should succeed after fit");
        assert!(s.is_finite(), "score should be finite, got {s}");
    }

    /// Scoring before fitting is reported as `NotFitted`.
    #[test]
    fn unfitted_returns_not_fitted_error() {
        let detector = Hbos::default();
        let err = detector.score(&[0.5_f32]).unwrap_err();
        assert!(
            matches!(err, AnomalyError::NotFitted),
            "expected NotFitted, got {err:?}"
        );
    }

    /// A point far from the training cluster scores higher than one inside it.
    #[test]
    fn outlier_scores_higher_than_inlier() {
        const FAR_AWAY: f32 = 10.0;
        // Tight cluster of 50 points starting at 0.4 with a step of 0.004.
        let train: Vec<f32> = (0..50).map(|i| 0.4 + i as f32 * 0.004).collect();
        let mut detector = Hbos::new(HbosConfig { n_bins: 10 });
        detector
            .fit(&train, 50, 1)
            .expect("fit should succeed on cluster training data");
        let inlier_score = detector
            .score(&[0.5_f32])
            .expect("inlier score should succeed");
        let outlier_score = detector
            .score(&[FAR_AWAY])
            .expect("outlier score should succeed");
        assert!(
            outlier_score > inlier_score,
            "outlier_score={outlier_score} should exceed inlier_score={inlier_score}"
        );
    }

    /// Fitting with zero samples is rejected as `EmptyInput`.
    #[test]
    fn empty_input_returns_error() {
        let mut detector = Hbos::default();
        let err = detector.fit(&[], 0, 1).unwrap_err();
        assert!(
            matches!(err, AnomalyError::EmptyInput),
            "expected EmptyInput, got {err:?}"
        );
    }

    /// A buffer whose length disagrees with the declared shape is rejected.
    #[test]
    fn dimension_mismatch_returns_error() {
        let mut training = Vec::with_capacity(10);
        for i in 0..10 {
            training.push(i as f32);
        }
        let mut detector = Hbos::default();
        // 10 values cannot be 10 samples of 2 features.
        let err = detector.fit(&training, 10, 2).unwrap_err();
        assert!(
            matches!(err, AnomalyError::DimensionMismatch { .. }),
            "expected DimensionMismatch, got {err:?}"
        );
    }

    /// Batch scoring returns one finite score per query row.
    #[test]
    fn score_batch_returns_correct_length() {
        let mut detector = Hbos::default();
        let training = simple_1d();
        detector
            .fit(&training, 20, 1)
            .expect("fit should succeed on valid 1-D data");
        let mut queries = Vec::with_capacity(5);
        for i in 0..5 {
            queries.push(i as f32 * 0.3);
        }
        let scores = detector
            .score_batch(&queries, 5)
            .expect("batch score should succeed");
        assert_eq!(scores.len(), 5, "batch output should have 5 scores");
        let every_score_finite = scores.iter().all(|s| s.is_finite());
        assert!(every_score_finite, "all scores must be finite");
    }

    /// A feature with zero variance can be fitted and scored without panicking.
    #[test]
    fn constant_feature_no_panic() {
        const CONSTANT_VALUE: f32 = 2.5;
        let training = [CONSTANT_VALUE; 30];
        let mut detector = Hbos::default();
        detector
            .fit(&training, 30, 1)
            .expect("fit should succeed on constant-feature data");
        let s = detector
            .score(&[CONSTANT_VALUE])
            .expect("score on constant feature should succeed");
        assert!(
            s.is_finite(),
            "score on constant feature must be finite, got {s}"
        );
    }

    /// With a single bin every in-range value shares the same density and score.
    #[test]
    fn n_bins_one_all_inliers_same_score() {
        let training: Vec<f32> = (0..20).map(|i| i as f32).collect();
        let mut detector = Hbos::new(HbosConfig { n_bins: 1 });
        detector
            .fit(&training, 20, 1)
            .expect("fit should succeed with n_bins=1");
        let s0 = detector
            .score(&[0.0_f32])
            .expect("score at range start should succeed");
        let s1 = detector
            .score(&[10.0_f32])
            .expect("score at mid-range should succeed");
        let s2 = detector
            .score(&[19.0_f32])
            .expect("score at range end should succeed");
        let start_matches_mid = (s0 - s1).abs() < 1e-5;
        let start_matches_end = (s0 - s2).abs() < 1e-5;
        assert!(
            start_matches_mid && start_matches_end,
            "all inliers should share the same score: {s0}, {s1}, {s2}"
        );
    }

    /// A configuration with zero bins is rejected at fit time.
    #[test]
    fn zero_n_bins_returns_error() {
        let training: Vec<f32> = (0..10).map(|i| i as f32).collect();
        let mut detector = Hbos::new(HbosConfig { n_bins: 0 });
        let err = detector.fit(&training, 10, 1).unwrap_err();
        assert!(
            matches!(err, AnomalyError::Internal { .. }),
            "expected Internal error for n_bins=0, got {err:?}"
        );
    }

    /// Scoring a sample with the wrong number of features reports both counts.
    #[test]
    fn feature_count_mismatch_on_score() {
        let mut detector = Hbos::default();
        let training = simple_1d();
        detector
            .fit(&training, 20, 1)
            .expect("fit should succeed on valid 1-D data");
        let err = detector.score(&[0.5_f32, 0.5_f32]).unwrap_err();
        assert!(
            matches!(
                err,
                AnomalyError::FeatureCountMismatch {
                    expected: 1,
                    got: 2
                }
            ),
            "expected FeatureCountMismatch, got {err:?}"
        );
    }

    /// Two-feature data: a point outside both ranges outscores one inside both.
    #[test]
    fn multivariate_fit_and_score() {
        // Row-major rows of (f0, f1) with f0 in [0, 1) and f1 in [10, 11).
        let mut training = Vec::with_capacity(60);
        for i in 0..30 {
            let f0 = i as f32 / 30.0;
            let f1 = 10.0 + i as f32 / 30.0;
            training.push(f0);
            training.push(f1);
        }
        let mut detector = Hbos::new(HbosConfig { n_bins: 5 });
        detector
            .fit(&training, 30, 2)
            .expect("fit should succeed on 2-D data");
        let s_in = detector
            .score(&[0.5_f32, 10.5_f32])
            .expect("inlier score on 2-D data should succeed");
        let s_out = detector
            .score(&[5.0_f32, 0.0_f32])
            .expect("outlier score on 2-D data should succeed");
        assert!(s_in.is_finite(), "inlier score must be finite");
        assert!(s_out.is_finite(), "outlier score must be finite");
        assert!(
            s_out > s_in,
            "out-of-range point should score higher: {s_out} vs {s_in}"
        );
    }
}