use multiversion::multiversion;
use rcflib::{
common::{
directionaldensity::InterpolationMeasure, divector::DiVector, rangevector::RangeVector,
},
errors::RCFError,
rcf::{AugmentedRCF, RCFBuilder, RCFLarge, RCFOptionsBuilder},
};
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct RandomCutForestOptions {
pub dimensions: usize,
pub shingle_size: usize,
pub id: Option<u64>,
pub num_trees: Option<usize>,
pub sample_size: Option<usize>,
pub output_after: Option<usize>,
pub random_seed: Option<u64>,
pub parallel_execution_enabled: Option<bool>,
pub lambda: Option<f64>,
pub internal_rotation: Option<bool>,
pub internal_shingling: Option<bool>,
pub propagate_attribute_vectors: Option<bool>,
pub store_pointsum: Option<bool>,
pub store_attributes: Option<bool>,
pub initial_accept_fraction: Option<f64>,
pub bounding_box_cache_fraction: Option<f64>,
}
impl Default for RandomCutForestOptions {
fn default() -> Self {
Self {
dimensions: 1,
shingle_size: 1,
id: None,
num_trees: None,
sample_size: None,
output_after: None,
random_seed: None,
parallel_execution_enabled: None,
lambda: None,
internal_rotation: None,
internal_shingling: None,
propagate_attribute_vectors: None,
store_pointsum: None,
store_attributes: None,
initial_accept_fraction: None,
bounding_box_cache_fraction: None,
}
}
}
impl Into<RCFBuilder> for RandomCutForestOptions {
fn into(self) -> RCFBuilder {
self.to_rcf_builder()
}
}
impl RandomCutForestOptions {
pub fn to_rcf_builder(&self) -> RCFBuilder {
let mut options = RCFBuilder::new(self.dimensions, self.shingle_size);
macro_rules! set_option {
($opt:expr, $method:ident) => {
if let Some(val) = $opt {
options.$method(val);
}
};
}
set_option!(self.id, id);
set_option!(self.num_trees, number_of_trees);
set_option!(self.sample_size, tree_capacity);
set_option!(self.output_after, output_after);
set_option!(self.random_seed, random_seed);
set_option!(self.parallel_execution_enabled, parallel_enabled);
set_option!(self.lambda, time_decay);
set_option!(self.internal_rotation, internal_rotation);
set_option!(self.internal_shingling, internal_shingling);
set_option!(
self.propagate_attribute_vectors,
propagate_attribute_vectors
);
set_option!(self.store_pointsum, store_pointsum);
set_option!(self.store_attributes, store_attributes);
set_option!(self.initial_accept_fraction, initial_accept_fraction);
set_option!(
self.bounding_box_cache_fraction,
bounding_box_cache_fraction
);
options
}
pub fn to_rcf(&self) -> Result<RCFLarge<u64, u64>, RCFError> {
self.to_rcf_builder().build_large_simple()
}
}
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct RandomCutForest(RCFLarge<u64, u64>);
#[multiversion(targets = "simd")]
fn shingled_point(rcf: &RandomCutForest, point: &[f32]) -> Result<Vec<f32>, RCFError> {
rcf.0.shingled_point(point)
}
#[multiversion(targets = "simd")]
fn update(rcf: &mut RandomCutForest, point: &[f32]) -> Result<(), RCFError> {
rcf.0.update(point, 0)
}
#[multiversion(targets = "simd")]
fn score(rcf: &RandomCutForest, point: &[f32]) -> Result<f64, RCFError> {
rcf.0.score(point)
}
#[multiversion(targets = "simd")]
fn displacement_score(rcf: &RandomCutForest, point: &[f32]) -> Result<f64, RCFError> {
rcf.0.displacement_score(point)
}
#[multiversion(targets = "simd")]
fn attribution(rcf: &RandomCutForest, point: &[f32]) -> Result<DiVector, RCFError> {
rcf.0.attribution(point)
}
#[multiversion(targets = "simd")]
fn near_neighbor_list(
rcf: &RandomCutForest,
point: &[f32],
percentile: usize,
) -> Result<Vec<(f64, Vec<f32>, f64)>, RCFError> {
rcf.0.near_neighbor_list(point, percentile)
}
#[multiversion(targets = "simd")]
fn density(rcf: &RandomCutForest, point: &[f32]) -> Result<f64, RCFError> {
rcf.0.density(point)
}
#[multiversion(targets = "simd")]
fn directional_density(rcf: &RandomCutForest, point: &[f32]) -> Result<DiVector, RCFError> {
rcf.0.directional_density(point)
}
#[multiversion(targets = "simd")]
fn density_interpolant(
rcf: &RandomCutForest,
point: &[f32],
) -> Result<InterpolationMeasure, RCFError> {
rcf.0.density_interpolant(point)
}
#[multiversion(targets = "simd")]
pub fn extrapolate(rcf: &RandomCutForest, look_ahead: usize) -> Result<RangeVector<f32>, RCFError> {
rcf.0.extrapolate(look_ahead)
}
impl RandomCutForest {
pub fn new(options: RandomCutForestOptions) -> Result<Self, RCFError> {
let rcf = options.to_rcf()?;
Ok(Self(rcf))
}
pub fn shingled_point(&self, point: &[f32]) -> Result<Vec<f32>, RCFError> {
shingled_point(self, point)
}
pub fn update(&mut self, point: &[f32]) -> Result<(), RCFError> {
update(self, point)
}
pub fn score(&self, point: &[f32]) -> Result<f64, RCFError> {
score(self, point)
}
pub fn displacement_score(&self, point: &[f32]) -> Result<f64, RCFError> {
displacement_score(self, point)
}
pub fn attribution(&self, point: &[f32]) -> Result<DiVector, RCFError> {
attribution(self, point)
}
pub fn near_neighbor_list(
&self,
point: &[f32],
percentile: usize,
) -> Result<Vec<(f64, Vec<f32>, f64)>, RCFError> {
near_neighbor_list(self, point, percentile)
}
pub fn density(&self, point: &[f32]) -> Result<f64, RCFError> {
density(self, point)
}
pub fn directional_density(&self, point: &[f32]) -> Result<DiVector, RCFError> {
directional_density(self, point)
}
pub fn density_interpolant(&self, point: &[f32]) -> Result<InterpolationMeasure, RCFError> {
density_interpolant(self, point)
}
pub fn extrapolate(&self, look_ahead: usize) -> Result<RangeVector<f32>, RCFError> {
extrapolate(self, look_ahead)
}
pub fn dimensions(&self) -> usize {
self.0.dimensions()
}
pub fn shingle_size(&self) -> usize {
self.0.shingle_size()
}
pub fn is_internal_shingling_enabled(&self) -> bool {
self.0.is_internal_shingling_enabled()
}
pub fn is_output_ready(&self) -> bool {
self.0.is_output_ready()
}
pub fn entries_seen(&self) -> u64 {
self.0.entries_seen()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_default_rcfoptions() {
let opts = RandomCutForestOptions::default();
assert_eq!(opts.dimensions, 1);
assert_eq!(opts.shingle_size, 1);
assert!(opts.num_trees.is_none());
assert!(opts.sample_size.is_none());
assert!(opts.output_after.is_none());
assert!(opts.random_seed.is_none());
assert!(opts.parallel_execution_enabled.is_none());
assert!(opts.lambda.is_none());
}
#[test]
fn test_to_rcf_builder() {
let opts = RandomCutForestOptions {
dimensions: 3,
shingle_size: 2,
num_trees: Some(50),
sample_size: Some(128),
output_after: Some(10),
random_seed: Some(42),
parallel_execution_enabled: Some(true),
lambda: Some(0.01),
..Default::default()
};
let builder = opts.to_rcf_builder();
let rcf = builder.build::<u64, u64>();
assert!(rcf.is_ok());
let rcf = builder.build_large_simple::<u64>();
assert!(rcf.is_ok());
}
#[test]
fn test_to_rcf_returns_ok() {
let opts = RandomCutForestOptions {
dimensions: 2,
shingle_size: 1,
num_trees: Some(10),
sample_size: Some(32),
..Default::default()
};
let rcf = opts.to_rcf();
assert!(rcf.is_ok());
}
#[test]
fn test_random_cut_forest_creation() {
let opts = RandomCutForestOptions {
dimensions: 2,
shingle_size: 2,
random_seed: Some(42),
..Default::default()
};
let rcf = RandomCutForest::new(opts).unwrap();
assert_eq!(rcf.dimensions(), 4);
assert_eq!(rcf.shingle_size(), 2);
assert!(rcf.is_internal_shingling_enabled());
assert!(!rcf.is_output_ready());
assert_eq!(rcf.entries_seen(), 0);
}
#[test]
fn test_random_cut_forest_update_and_score() {
let opts = RandomCutForestOptions {
dimensions: 2,
shingle_size: 1,
num_trees: Some(10),
sample_size: Some(32),
..Default::default()
};
let mut rcf = RandomCutForest::new(opts).unwrap();
let point = vec![1.0, 2.0];
rcf.update(&point).unwrap();
let score = rcf.score(&point).unwrap();
assert!(score.is_finite());
assert!(score < 2.0);
let displacement_score = rcf.displacement_score(&point).unwrap();
assert!(displacement_score.is_finite());
let attribution = rcf.attribution(&point).unwrap();
assert_eq!(attribution.high.len(), 2);
assert_eq!(attribution.low.len(), 2);
let neighbors = rcf.near_neighbor_list(&point, 10).unwrap();
assert!(!neighbors.is_empty());
for (dist, neighbor, _) in neighbors {
assert!(dist.is_finite());
assert_eq!(neighbor.len(), 2);
assert!(neighbor.iter().all(|&x| x.is_finite()));
}
}
#[test]
fn test_rcf_serde_json() {
let opts = RandomCutForestOptions {
dimensions: 3,
random_seed: Some(42),
..Default::default()
};
let mut rcf = RandomCutForest::new(opts).unwrap();
let serialized = serde_json::to_string(&rcf).unwrap();
let mut deserialized: RandomCutForest = serde_json::from_str(&serialized).unwrap();
let point = vec![1.0, 2.0, 3.0];
rcf.update(&point).unwrap();
let score1 = rcf.score(&point).unwrap();
deserialized.update(&point).unwrap();
let score2 = deserialized.score(&point).unwrap();
assert_eq!(score1, score2);
}
#[test]
fn test_rcf_update_invalid_dimension() {
let opts = RandomCutForestOptions {
dimensions: 2,
..Default::default()
};
let mut rcf = RandomCutForest::new(opts).unwrap();
let point = vec![1.0, 2.0, 3.0];
let result = rcf.update(&point);
assert!(result.is_err());
}
#[test]
fn test_rcf_score_before_update() {
let opts = RandomCutForestOptions {
dimensions: 2,
..Default::default()
};
let rcf = RandomCutForest::new(opts).unwrap();
let point = vec![0.0, 0.0];
let score = rcf.score(&point).unwrap();
assert!(score.is_finite());
}
#[test]
fn test_rcf_entries_seen_and_output_ready() {
let opts = RandomCutForestOptions {
dimensions: 2,
shingle_size: 1,
num_trees: Some(5),
sample_size: Some(10),
output_after: Some(2),
..Default::default()
};
let mut rcf = RandomCutForest::new(opts).unwrap();
let point = vec![1.0, 2.0];
assert_eq!(rcf.entries_seen(), 0);
assert!(!rcf.is_output_ready());
rcf.update(&point).unwrap();
assert_eq!(rcf.entries_seen(), 1);
rcf.update(&point).unwrap();
assert_eq!(rcf.entries_seen(), 2);
rcf.update(&point).unwrap();
assert!(rcf.is_output_ready());
}
#[test]
fn test_rcf_shingled_point_and_internal_shingling() {
let dim = 2;
let shingle_size = 2;
let opts = RandomCutForestOptions {
dimensions: dim,
shingle_size: shingle_size,
random_seed: Some(42),
..Default::default()
};
let rcf = RandomCutForest::new(opts).unwrap();
assert_eq!(rcf.shingle_size(), shingle_size);
assert!(rcf.is_internal_shingling_enabled());
let point = vec![1.0, 2.0];
let shingled = rcf.shingled_point(&point);
assert!(shingled.is_ok());
let shingled_vec = shingled.unwrap();
assert_eq!(shingled_vec.len(), dim * shingle_size);
}
#[test]
fn test_rcf_density_and_directional_density() {
let dim = 3;
let shingle_size = 2;
let opts = RandomCutForestOptions {
dimensions: dim,
shingle_size: shingle_size,
random_seed: Some(42),
..Default::default()
};
let mut rcf = RandomCutForest::new(opts).unwrap();
let point = vec![1.0; dim];
rcf.update(&point).unwrap();
rcf.update(&point).unwrap();
rcf.update(&point).unwrap();
let density = rcf.density(&point).unwrap();
assert!(density.is_finite());
let dir_density = rcf.directional_density(&point).unwrap();
assert_eq!(dir_density.high.len(), dim * shingle_size);
assert_eq!(dir_density.low.len(), dim * shingle_size);
}
#[test]
fn test_rcf_density_interpolant_and_extrapolate() {
let opts = RandomCutForestOptions {
dimensions: 2,
shingle_size: 2,
random_seed: Some(42),
..Default::default()
};
let mut rcf = RandomCutForest::new(opts).unwrap();
let point = vec![1.0, 2.0];
rcf.update(&point).unwrap();
rcf.update(&point).unwrap();
rcf.update(&point).unwrap();
let interpolant = rcf.density_interpolant(&point).unwrap();
let _ = interpolant.measure;
let _ = interpolant.distance;
let _ = interpolant.probability_mass;
let _ = interpolant.sample_size;
let extrapolated = rcf.extrapolate(1).unwrap();
assert_eq!(extrapolated.values.len(), 2);
}
}