rafor 0.3.0

Fast Random Forest library.
Documentation
use crate::{dt, prelude::*, rf};
use std::{fs::read_to_string, str::FromStr};

const MAX_THREADS: usize = 8;

/// The values for comparison are computed using scikit-learn using python_tests/tests.py script.
#[test]
fn classifier_tree_overfit_self() {
    let (samples, targets) = load_dataset::<i64>("datasets/winequality-red.csv", ";", true);
    let predictor = dt::Classifier::trainer()
        .with_max_depth(500)
        .train(&samples, &targets);

    let y_pred = predictor.predict(&samples);
    let acc = classifier_accuracy(&y_pred, &targets);
    assert!(acc == 1.0);
}

#[test]
fn classifier_tree_depth10() {
    let (samples, targets) = load_dataset::<i64>("datasets/winequality-red.csv", ";", true);
    let (x_train, y_train, x_pred, y_ref) = split_dataset(&samples, &targets);
    let predictor = dt::Classifier::trainer()
        .with_max_depth(10)
        .train(&x_train, &y_train);

    let y_pred = predictor.predict(&x_pred);
    let acc = classifier_accuracy(&y_pred, &y_ref);
    assert!(acc >= 0.61);
}

#[test]
fn random_forest_classifier_overfit_self() {
    let (samples, targets) = load_dataset::<i64>("datasets/winequality-red.csv", ";", true);
    let predictor = rf::Classifier::trainer()
        .with_max_depth(500)
        .with_trees(100)
        .train(&samples, &targets);

    let y_pred = predictor.predict(&samples, MAX_THREADS);
    let acc = classifier_accuracy(&y_pred, &targets);
    assert!(acc == 1.0);
}

#[test]
fn random_forest_classifier_depth10() {
    let (samples, targets) = load_dataset::<i64>("datasets/winequality-red.csv", ";", true);
    let (x_train, y_train, x_pred, y_ref) = split_dataset(&samples, &targets);
    let predictor = rf::Classifier::trainer()
        .with_max_depth(10)
        .with_trees(100)
        .train(&x_train, &y_train);

    let y_pred = predictor.predict(&x_pred, MAX_THREADS);
    let acc = classifier_accuracy(&y_pred, &y_ref);
    assert!(acc >= 0.67);
}

#[test]
fn random_forest_classifier_depth10_self() {
    let (samples, targets) = load_dataset::<i64>("datasets/winequality-red.csv", ";", true);
    let predictor = rf::Classifier::trainer()
        .with_max_depth(10)
        .with_trees(100)
        .train(&samples, &targets);

    let y_pred = predictor.predict(&samples, MAX_THREADS);
    let acc = classifier_accuracy(&y_pred, &targets);
    assert!(acc >= 0.93);
}

#[test]
fn random_forest_binary_classifier() {
    let (samples, targets) = load_dataset::<String>("datasets/magic04.data", ",", false);
    let targets: Vec<i64> = targets.iter().map(|t| (t == "h") as i64).collect();

    let (x_train, y_train, x_pred, y_ref) = split_dataset(&samples, &targets);
    let predictor = rf::Classifier::trainer()
        .with_max_depth(10)
        .with_trees(100)
        .train(&x_train, &y_train);

    let y_pred = predictor.predict(&x_pred, MAX_THREADS);
    let f1 = f1score(&y_pred, &y_ref);
    assert!(f1 >= 0.79);
}

fn f1score(pred: &[i64], target: &[i64]) -> f64 {
    let tp = pred
        .iter()
        .zip(target.iter())
        .filter(|&(p, t)| *p == 1 && *t == 1)
        .count();
    let fp = pred
        .iter()
        .zip(target.iter())
        .filter(|&(p, t)| *p == 1 && *t == 0)
        .count();
    let fnn = pred
        .iter()
        .zip(target.iter())
        .filter(|&(p, t)| *p == 0 && *t == 1)
        .count();
    let precision = tp as f64 / (tp as f64 + fp as f64);
    let recall = tp as f64 / (tp as f64 + fnn as f64);
    let f1 = 2. * precision * recall / (precision + recall);
    f1
}

#[test]
fn decision_tree_regressor_overfit_self() {
    let (samples, targets) = load_dataset::<f32>("datasets/Folds5x2_pp.csv", ",", true);
    let predictor = dt::Regressor::trainer()
        .with_max_depth(500)
        .train(&samples, &targets);

    let y_pred = predictor.predict(&samples);
    let mse = mean_squared_error(&y_pred, &targets);
    assert!(mse < 0.000001);
}

#[test]
fn decision_tree_regressor() {
    let (samples, targets) = load_dataset::<f32>("datasets/Folds5x2_pp.csv", ",", true);
    let (x_train, y_train, x_pred, y_ref) = split_dataset(&samples, &targets);
    let predictor = dt::Regressor::trainer()
        .with_max_depth(500)
        .train(&x_train, &y_train);

    let y_pred = predictor.predict(&x_pred);
    let mse = mean_squared_error(&y_pred, &y_ref);
    assert!(mse < 23.0);
}

#[test]
fn random_forest_regressor_overfit_self() {
    let (samples, targets) = load_dataset::<f32>("datasets/Folds5x2_pp.csv", ",", true);
    let predictor = rf::Regressor::trainer()
        .with_max_depth(1000)
        .with_trees(25)
        .train(&samples, &targets);

    let y_pred = predictor.predict(&samples, MAX_THREADS);
    let mse = mean_squared_error(&y_pred, &targets);
    assert!(mse < 1.7);
}

fn random_forest_regressor(max_depth: usize) -> f64 {
    let (samples, targets) = load_dataset::<f32>("datasets/Folds5x2_pp.csv", ",", true);
    let (x_train, y_train, x_pred, y_ref) = split_dataset(&samples, &targets);
    let predictor = rf::Regressor::trainer()
        .with_max_depth(max_depth)
        .train(&x_train, &y_train);

    let y_pred = predictor.predict(&x_pred, MAX_THREADS);
    mean_squared_error(&y_pred, &y_ref)
}

#[test]
fn random_forest_regressor_depth5() {
    let mse = random_forest_regressor(5);
    assert!(mse < 19.5);
}

#[test]
fn random_forest_regressor_depth10() {
    let mse = random_forest_regressor(10);
    assert!(mse < 15.0);
}

fn mean_squared_error(v: &[f32], u: &[f32]) -> f64 {
    assert!(v.len() == u.len());
    let mse = v
        .iter()
        .zip(u.iter())
        .map(|(&x, &y)| (x - y) as f64 * (x - y) as f64)
        .sum::<f64>()
        / v.len() as f64;
    mse
}

fn split_dataset<T: Copy>(x: &[f32], y: &[T]) -> (Vec<f32>, Vec<T>, Vec<f32>, Vec<T>) {
    assert!(x.len() % y.len() == 0);
    let features = x.len() / y.len();

    let x_train: Vec<f32> = x
        .chunks(features)
        .enumerate()
        .filter(|(i, _)| i % 5 > 0)
        .flat_map(|(_, chunk)| chunk.iter().cloned())
        .collect();

    let y_train: Vec<T> = y
        .iter()
        .enumerate()
        .filter(|(i, _)| i % 5 > 0)
        .map(|(_, v)| *v)
        .collect();

    let x_pred: Vec<f32> = x
        .chunks(features)
        .enumerate()
        .filter(|(i, _)| i % 5 == 0)
        .flat_map(|(_, chunk)| chunk.iter().cloned())
        .collect();

    let y_ref: Vec<T> = y
        .iter()
        .enumerate()
        .filter(|(i, _)| i % 5 == 0)
        .map(|(_, v)| *v)
        .collect();

    (x_train, y_train, x_pred, y_ref)
}

fn classifier_accuracy(v: &[i64], u: &[i64]) -> f64 {
    assert!(v.len() == u.len());
    v.iter().zip(u.iter()).filter(|(x, y)| x == y).count() as f64 / u.len() as f64
}

fn load_dataset<T>(fname: &str, delimeter: &str, skip_first_line: bool) -> (Vec<f32>, Vec<T>)
where
    T: FromStr,
    T::Err: std::fmt::Debug,
{
    let lines: Vec<String> = read_to_string(fname)
        .unwrap()
        .lines()
        .map(|s| s.to_string())
        .collect();
    let mut samples: Vec<f32> = Vec::new();
    let mut targets: Vec<T> = Vec::new();
    let mut tokens_per_line = 0;
    for line in lines.iter().skip(skip_first_line as usize) {
        let w: Vec<_> = line.split(delimeter).collect();
        if tokens_per_line == 0 {
            tokens_per_line = w.len();
        }
        assert!(
            tokens_per_line == w.len(),
            "Lines in dataset has different sizes."
        );

        samples.extend(
            w.iter()
                .take(tokens_per_line - 1)
                .map(|s| s.parse::<f32>().unwrap()),
        );
        targets.push(
            w.last()
                .unwrap()
                .parse::<T>()
                .expect("Couldn't parse target value"),
        );
    }
    (samples, targets)
}