use std::boxed::Box;
use std::result::Result;
use num::{traits::FloatConst, Float};
use rand::{
distributions::{uniform::SampleUniform, Uniform},
rngs::ThreadRng,
seq::{IteratorRandom, SliceRandom},
Rng,
};
use rand_distr::{Distribution, StandardNormal};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
pub use crate::error::Error;
mod error;
#[cfg(feature = "serde")]
mod serde_array;
#[cfg(not(feature = "serde"))]
pub trait ForestFloat<'de>: Float {}
#[cfg(feature = "serde")]
pub trait ForestFloat<'de>: Float + Serialize + Deserialize<'de> {}
impl<'de> ForestFloat<'de> for f32 {}
impl<'de> ForestFloat<'de> for f64 {}
pub struct ForestOptions {
pub n_trees: usize,
pub sample_size: usize,
pub max_tree_depth: Option<usize>,
pub extension_level: usize,
}
impl Default for ForestOptions {
fn default() -> Self {
Self {
n_trees: 20,
sample_size: 20,
max_tree_depth: None,
extension_level: 0,
}
}
}
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Forest<T, const N: usize> {
avg_path_length_c: f64,
trees: Vec<Tree<T, N>>,
}
impl<'de, T, const N: usize> Forest<T, N>
where
T: ForestFloat<'de> + SampleUniform + Default,
StandardNormal: Distribution<T>,
{
pub fn from_slice(training_data: &[[T; N]], options: &ForestOptions) -> Result<Self, Error> {
if training_data.len() < options.sample_size || N == 0 {
return Err(Error::InsufficientTrainingData);
} else if options.extension_level > (N - 1) {
return Err(Error::ExtensionLevelExceedsDimensions);
}
let max_tree_depth = if let Some(mdt) = options.max_tree_depth {
mdt
} else {
(options.sample_size as f64).log2().ceil() as usize
};
let rng = &mut rand::thread_rng();
let trees = (0..options.n_trees)
.map(|_| {
let tree_sample: Vec<_> = training_data
.choose_multiple(rng, options.sample_size)
.into_iter()
.collect();
Tree::new(
tree_sample.as_slice(),
rng,
max_tree_depth,
options.extension_level,
)
})
.collect();
Ok(Self {
avg_path_length_c: c_factor(options.sample_size),
trees,
})
}
pub fn score(&self, values: &[T; N]) -> f64 {
let path_length: f64 = self.trees.iter().map(|tree| tree.path_length(values)).sum();
let eh = path_length / self.trees.len() as f64;
2.0_f64.powf(-eh / self.avg_path_length_c)
}
}
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
enum Node<T, const N: usize> {
Ex(ExNode),
In(InNode<T, N>),
}
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
struct InNode<T, const N: usize> {
left: Box<Node<T, N>>,
right: Box<Node<T, N>>,
#[cfg_attr(feature = "serde", serde(with = "serde_array"))]
n: [T; N],
#[cfg_attr(feature = "serde", serde(with = "serde_array"))]
p: [T; N],
}
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
struct ExNode {
num_samples: usize,
}
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
struct Tree<T, const N: usize> {
root: Node<T, N>,
}
impl<'de, T, const N: usize> Tree<T, N>
where
T: ForestFloat<'de> + SampleUniform + Default,
StandardNormal: Distribution<T>,
{
pub fn new(
samples: &[&[T; N]],
rng: &mut ThreadRng,
max_tree_depth: usize,
extension_level: usize,
) -> Self {
Self {
root: Self::make_tree(samples, rng, 0, max_tree_depth, extension_level),
}
}
fn make_tree(
samples: &[&[T; N]],
rng: &mut ThreadRng,
current_tree_depth: usize,
max_tree_depth: usize,
extension_level: usize,
) -> Node<T, N> {
let num_samples = samples.len();
if current_tree_depth >= max_tree_depth || num_samples <= 1 {
Node::Ex(ExNode { num_samples })
} else {
let p = {
let mut maxs = *samples[0];
let mut mins = *samples[0];
samples.iter().skip(1).for_each(|s| {
s.iter().enumerate().for_each(|(i, v)| {
maxs[i] = if *v > maxs[i] { *v } else { maxs[i] };
mins[i] = if *v < mins[i] { *v } else { mins[i] };
})
});
let mut p = [T::zero(); N];
mins.iter().zip(maxs.iter()).zip(p.iter_mut()).for_each(
|((min_val, max_val), p_i)| *p_i = rng.sample(Uniform::new(*min_val, *max_val)),
);
p
};
let mut n = [T::zero(); N];
(0..N)
.zip(n.iter_mut())
.for_each(|(_, n_i)| *n_i = rng.sample(StandardNormal));
for idx in (0..N).choose_multiple(rng, N - extension_level - 1) {
n[idx] = T::zero();
}
let mut samples_left = vec![];
let mut samples_right = vec![];
for sample in samples {
match determinate_direction(*sample, &n, &p) {
Direction::Left => samples_left.push(*sample),
Direction::Right => samples_right.push(*sample),
}
}
Node::In(InNode {
left: Box::new(Self::make_tree(
samples_left.as_slice(),
rng,
current_tree_depth + 1,
max_tree_depth,
extension_level,
)),
right: Box::new(Self::make_tree(
samples_right.as_slice(),
rng,
current_tree_depth + 1,
max_tree_depth,
extension_level,
)),
n,
p,
})
}
}
pub fn path_length(&self, values: &[T; N]) -> f64 {
self.path_length_recurse(&self.root, values)
}
fn path_length_recurse(&self, node: &Node<T, N>, values: &[T; N]) -> f64 {
match node {
Node::Ex(ex_node) => {
if ex_node.num_samples <= 1 {
0.0
} else {
c_factor(ex_node.num_samples)
}
}
Node::In(in_node) => {
1.0 + self.path_length_recurse(
match determinate_direction(values, &in_node.n, &in_node.p) {
Direction::Left => in_node.left.as_ref(),
Direction::Right => in_node.right.as_ref(),
},
values,
)
}
}
}
}
fn c_factor(n: usize) -> f64 {
2.0 * ((n as f64 - 1.0).log(f64::E()) + 0.5772156649) - (2.0 * (n as f64 - 1.0) / n as f64)
}
enum Direction {
Left,
Right,
}
fn determinate_direction<T, const N: usize>(sample: &[T; N], n: &[T; N], p: &[T; N]) -> Direction
where
T: Float,
{
let direction_value = sample
.iter()
.zip(p.iter())
.map(|(sample_val, p_val)| *sample_val - *p_val)
.zip(n.iter())
.fold(T::zero(), |sum, (sp_val, n_val)| sum + sp_val * (*n_val));
if direction_value <= T::zero() {
Direction::Left
} else {
Direction::Right
}
}
#[cfg(test)]
mod tests {
use rand::distributions::Uniform;
use rand::Rng;
use crate::{Forest, ForestOptions};
fn make_f64_forest() -> Forest<f64, 3> {
let rng = &mut rand::thread_rng();
let distribution = Uniform::new(-4., 4.);
let distribution2 = Uniform::new(10., 50.);
let values: Vec<_> = (0..6000)
.map(|_| {
[
rng.sample(distribution),
rng.sample(distribution),
rng.sample(distribution2),
]
})
.collect();
let options = ForestOptions {
n_trees: 150,
sample_size: 200,
max_tree_depth: None,
extension_level: 1,
};
Forest::from_slice(values.as_slice(), &options).unwrap()
}
fn assert_anomalies_forest_3d_f64(forest: &Forest<f64, 3>) {
assert!(forest.score(&[1.0, 3.0, 25.0]) < 0.5);
assert!(forest.score(&[-1.0, 3.0, 25.0]) < 0.5);
assert!(forest.score(&[-12.0, 6.0, 25.0]) > 0.5);
assert!(forest.score(&[-1.0, 2.0, 60.0]) > 0.5);
assert!(forest.score(&[-1.0, 2.0, 0.0]) > 0.5);
}
#[test]
fn score_forest_3d_f64() {
let forest = make_f64_forest();
assert_anomalies_forest_3d_f64(&forest);
}
#[cfg(feature = "serde")]
#[test]
fn serialize_forest_3d_f64() {
let forest = make_f64_forest();
let forest_json = serde_json::to_string(&forest).unwrap();
let forest2 = serde_json::from_str(forest_json.as_str()).unwrap();
assert_anomalies_forest_3d_f64(&forest2);
}
}