use std::cmp::{min, Ordering};
use std::f32;
use std::usize;
use prelude::*;
use multiclass::OneVsRestWrapper;
use utils::EncodableRng;
use rand;
use rand::{Rng, StdRng};
use rand::distributions::{IndependentSample, Range};
fn sample_without_replacement<T: Copy, R: Rng>(from: &mut [T],
to: &mut Vec<T>,
number: usize,
rng: &mut R) {
for num_sampled in 0..number {
let idx = Range::new(num_sampled, from.len()).ind_sample(rng);
let sampled = from[idx];
to.push(sampled);
from.swap(idx, num_sampled);
}
}
struct FeatureIndices {
num_used: usize,
candidate_indices: Vec<usize>,
}
impl FeatureIndices {
fn new(candidate_indices: Vec<usize>) -> FeatureIndices {
FeatureIndices {num_used: 0,
candidate_indices: candidate_indices}
}
fn sample_indices(&mut self, to: &mut Vec<usize>, number: usize, rng: &mut StdRng) {
to.clear();
let number = min(number, self.candidate_indices.len() - self.num_used);
sample_without_replacement(&mut self.candidate_indices[self.num_used..],
to,
number,
rng);
}
fn mark_as_used(&mut self, feature_idx: usize) {
let indices = &mut self.candidate_indices[self.num_used..];
indices.swap(0, feature_idx);
self.num_used += 1;
}
}
#[derive(Debug)]
struct FeatureValues {
xy_pairs: Vec<(f32, f32)>,
zero_count: usize,
zero_y: f32,
count: usize,
total_y: f32
}
impl FeatureValues {
fn with_capacity(capacity: usize) -> FeatureValues {
FeatureValues {
xy_pairs: Vec::with_capacity(capacity),
zero_count: 0,
zero_y: 0.0,
count: 0,
total_y: 0.0
}
}
fn push(&mut self, x: f32, y: f32) {
self.count += 1;
self.total_y += y;
if x == 0.0 {
if self.zero_count == 0 {
self.xy_pairs.push((x, 0.0))
}
self.zero_count += 1;
self.zero_y += y;
} else {
self.xy_pairs.push((x, y))
}
}
fn fill_remaining_zeros(&mut self, remaining_zero_count: usize, remaining_positives: f32) {
if self.zero_count == 0 {
self.xy_pairs.push((0.0, 0.0));
}
self.zero_count += remaining_zero_count;
self.zero_y += remaining_positives;
self.total_y += remaining_positives;
self.count += remaining_zero_count;
}
fn sort(&mut self) {
self.xy_pairs.sort_by(|a, b|
a.0.partial_cmp(&b.0)
.unwrap_or(Ordering::Equal));
}
fn value_bounds(&self) -> (f32, f32) {
match self.xy_pairs.len() {
0 => (0.0, 0.0),
_ => (self.xy_pairs[0].0,
self.xy_pairs[self.xy_pairs.len() - 1].0),
}
}
fn feature_type(&self) -> FeatureType {
match self.xy_pairs.len() {
1 => FeatureType::Constant,
2 => FeatureType::Binary,
_ => FeatureType::Continuous,
}
}
fn clear(&mut self) {
unsafe {
self.xy_pairs.set_len(0);
}
self.zero_count = 0;
self.zero_y = 0.0;
self.total_y = 0.0;
self.count = 0;
}
}
#[derive(Clone)]
#[derive(RustcEncodable, RustcDecodable)]
enum FeatureType {
Constant,
Binary,
Continuous
}
#[derive(RustcEncodable, RustcDecodable)]
#[derive(Clone)]
pub struct Hyperparameters {
dim: usize,
max_features: usize,
min_samples_split: usize,
max_depth: usize,
rng: EncodableRng
}
impl Hyperparameters {
pub fn new(dim: usize) -> Hyperparameters {
Hyperparameters {dim: dim,
max_features: (dim as f32).sqrt() as usize,
min_samples_split: 2,
max_depth: usize::MAX,
rng: EncodableRng::new()}
}
pub fn max_features(&mut self, max_features: usize) -> &mut Hyperparameters {
self.max_features = max_features;
self
}
pub fn min_samples_split(&mut self, min_samples_split: usize) -> &mut Hyperparameters {
self.min_samples_split = min_samples_split;
self
}
pub fn max_depth(&mut self, max_depth: usize) -> &mut Hyperparameters {
self.max_depth = max_depth;
self
}
pub fn rng(&mut self, rng: rand::StdRng) -> &mut Hyperparameters {
self.rng.rng = rng;
self
}
pub fn build(&self) -> DecisionTree {
DecisionTree {dim: self.dim,
max_features: self.max_features,
max_depth: self.max_depth,
min_samples_split: self.min_samples_split,
root: None,
feature_types: Vec::new(),
rng: self.rng.clone()}
}
pub fn one_vs_rest(&self) -> OneVsRestWrapper<DecisionTree> {
let base_model = self.build();
OneVsRestWrapper::new(base_model)
}
}
#[derive(RustcEncodable, RustcDecodable)]
#[derive(Clone)]
enum Node {
Interior {feature: usize,
threshold: f32,
children: Box<(Node, Node)>},
Leaf {probability: f32}
}
#[derive(RustcEncodable, RustcDecodable)]
#[derive(Clone)]
pub struct DecisionTree {
dim: usize,
max_features: usize,
max_depth: usize,
min_samples_split: usize,
root: Option<Node>,
feature_types: Vec<FeatureType>,
rng: EncodableRng
}
impl SupervisedModel<Array> for DecisionTree {
fn fit(&mut self, X: &Array, y: &Array) -> Result<(), &'static str> {
assert!(X.cols() == self.dim);
assert!(X.rows() == y.rows());
assert!(y.cols() == 1);
self.feature_types = DecisionTree::analyze_features(X);
let mut feature_values = FeatureValues::with_capacity(X.rows());
let mut feature_indices = FeatureIndices::new(self.get_nonconstant_feature_indices());
let mut candidate_features = Vec::with_capacity(self.max_features);
self.root = Some(self.build_tree(X,
y,
&mut (0..X.rows()).collect::<Vec<usize>>()[..],
&mut feature_indices,
&mut candidate_features,
0,
&mut feature_values));
Ok(())
}
fn decision_function(&self, x: &Array) -> Result<Array, &'static str> {
match self.root {
Some(ref node) => {
let mut data = Vec::with_capacity(x.rows());
for row_idx in 0..x.rows() {
data.push(self.query_tree(&node, x, row_idx));
}
Ok(Array::from(data))
},
None => Err("Tree must be built before predicting")
}
}
}
impl SupervisedModel<SparseColumnArray> for DecisionTree {
fn fit(&mut self, X: &SparseColumnArray, y: &Array) -> Result<(), &'static str> {
assert!(X.cols() == self.dim);
assert!(X.rows() == y.rows());
assert!(y.cols() == 1);
self.feature_types = DecisionTree::analyze_features_sparse(X);
let mut feature_values = FeatureValues::with_capacity(X.rows());
let mut feature_indices = FeatureIndices::new(self.get_nonconstant_feature_indices());
let mut candidate_features = Vec::with_capacity(self.max_features);
self.root = Some(self.build_tree_sparse(X,
y,
&mut (0..X.rows()).collect::<Vec<usize>>()[..],
&mut feature_indices,
&mut candidate_features,
0,
&mut feature_values));
Ok(())
}
fn decision_function(&self, x: &SparseColumnArray) -> Result<Array, &'static str> {
match self.root {
Some(ref node) => {
let mut data = Vec::with_capacity(x.rows());
for row_idx in 0..x.rows() {
data.push(self.query_tree_sparse(&node, x, row_idx));
}
Ok(Array::from(data))
},
None => Err("Tree must be built before predicting")
}
}
}
macro_rules! build_tree {
($fn_name:ident, $x_type:ty,
$get_values:ident, $split_indices:ident) => {
fn $fn_name(&mut self, X: &$x_type, y: &Array,
indices: &mut [usize],
feature_indices: &mut FeatureIndices,
candidate_features: &mut Vec<usize>,
depth: usize,
feature_values: &mut FeatureValues) -> Node {
let num_positives = DecisionTree::count_positives(y, indices);
let probability = num_positives as f32 / indices.len() as f32;
if probability == 0.0
|| probability == 1.0
|| depth > self.max_depth
|| indices.len() < self.min_samples_split {
return Node::Leaf{probability: probability};
}
for _ in 0..10 {
feature_indices.sample_indices(candidate_features,
self.max_features,
&mut self.rng.rng);
let mut best_feature_position = 0;
let mut best_feature_idx = 0;
let mut best_feature_threshold = 0.0;
let mut best_impurity = f32::INFINITY;
for (feature_position, &feature_idx) in candidate_features.iter().enumerate() {
DecisionTree::$get_values(X, y, num_positives,
feature_idx,
indices,
feature_values);
if let FeatureType::Constant = feature_values.feature_type() {
feature_indices.mark_as_used(feature_position);
continue
}
let (threshold, impurity) = DecisionTree::calculate_split(feature_values);
if impurity < best_impurity {
best_feature_position = feature_position;
best_feature_idx = feature_idx;
best_feature_threshold = threshold;
best_impurity = impurity;
}
}
let (left_indices, right_indices) = DecisionTree::$split_indices(X,
indices,
best_feature_idx,
best_feature_threshold);
if left_indices.len() > 0 && right_indices.len() > 0 {
match self.feature_types[best_feature_idx] {
FeatureType::Binary => {
feature_indices.mark_as_used(best_feature_position);
},
_ => {}
}
let num_used_features = feature_indices.num_used;
let left = self.$fn_name(X, y,
left_indices,
feature_indices,
candidate_features,
depth + 1,
feature_values);
feature_indices.num_used = num_used_features;
let right = self.$fn_name(X, y,
right_indices,
feature_indices,
candidate_features,
depth + 1,
feature_values);
return Node::Interior {feature: best_feature_idx,
threshold: best_feature_threshold,
children: Box::new((left,
right))}
}
}
Node::Leaf {probability: probability}
}
}
}
impl DecisionTree {
fn analyze_features(X: &Array) -> Vec<FeatureType> {
let mut features = Vec::with_capacity(X.cols());
for col in X.iter_columns() {
let mut values = col.iter().collect::<Vec<_>>();
values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
values.dedup();
features.push(match values.len() {
1 => FeatureType::Constant,
2 => FeatureType::Binary,
_ => FeatureType::Continuous
});
}
features
}
fn analyze_features_sparse(X: &SparseColumnArray) -> Vec<FeatureType> {
let mut features = Vec::with_capacity(X.cols());
for col in X.iter_columns() {
let mut values = col.iter_nonzero()
.map(|(_, val)| val).collect::<Vec<_>>();
if values.len() < X.rows() {
values.push(0.0);
}
values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
values.dedup();
features.push(match values.len() {
1 => FeatureType::Constant,
2 => FeatureType::Binary,
_ => FeatureType::Continuous
});
}
features
}
fn get_nonconstant_feature_indices(&self) -> Vec<usize> {
self.feature_types.iter().enumerate()
.filter(|&(_, feature_type)| match feature_type {
&FeatureType::Constant => false,
_ => true
})
.map(|(i, _)| i)
.collect::<Vec<_>>()
}
fn count_positives(y: &Array, indices: &mut [usize]) -> usize {
let mut count = 0;
let data = y.data();
for &row_idx in indices.iter() {
if data[row_idx] > 0.0 {
count += 1;
}
}
count
}
build_tree!(build_tree, Array,
get_values,
split_indices);
build_tree!(build_tree_sparse, SparseColumnArray,
get_values_sparse,
split_indices_sparse);
fn split_indices<'a>(X: &Array, indices: &'a mut [usize],
feature_idx: usize, threshold: f32) -> (&'a mut [usize], &'a mut [usize]) {
let mut num_left = 0;
for i in 0..indices.len() {
let row_idx = indices[i];
if X.get(row_idx, feature_idx) <= threshold {
indices[i] = indices[num_left];
indices[num_left] = row_idx;
num_left += 1;
}
}
let (left, right) = indices.split_at_mut(num_left);
right.sort();
(left, right)
}
fn split_indices_sparse<'a>(x: &SparseColumnArray,
indices: &'a mut [usize],
feature_idx: usize,
threshold: f32) -> (&'a mut [usize], &'a mut [usize]) {
let mut num_left = 0;
let indices_len = indices.len();
let x = x.view_column(feature_idx);
macro_rules! assign {
($i:expr, $row_idx:expr, $value:expr) => {{
if $value <= threshold {
indices[$i] = indices[num_left];
indices[num_left] = $row_idx;
num_left += 1;
}
}}
}
let mut nonzero_iter = x.iter_nonzero();
let mut nonzero_option = nonzero_iter.next();
let mut i = 0;
while i < indices_len {
let indices_idx = indices[i];
if let Some((nonzero_idx, nonzero_value)) = nonzero_option {
match indices_idx.cmp(&nonzero_idx) {
Ordering::Less => {
assign!(i, indices_idx, 0.0);
i += 1;
},
Ordering::Equal => {
assign!(i, indices_idx, nonzero_value);
i += 1;
nonzero_option = nonzero_iter.next();
},
Ordering::Greater => {
nonzero_option = nonzero_iter.next();
},
}
} else {
assign!(i, indices_idx, 0.0);
i += 1;
}
}
let (left, right) = indices.split_at_mut(num_left);
right.sort();
(left, right)
}
fn calculate_split(values: &FeatureValues) -> (f32, f32) {
let (_, max_value) = values.value_bounds();
let mut split_impurity = f32::INFINITY;
let mut split_x = 0.0;
let total_count = values.count as f32;
let total_y = values.total_y;
let mut cumulative_count = 0.0;
let mut cumulative_y = 0.0;
for &(x, y) in values.xy_pairs.iter() {
if x == 0.0 {
cumulative_count += values.zero_count as f32;
cumulative_y += values.zero_y;
} else {
cumulative_count += 1.0;
cumulative_y += y;
}
if x == max_value {
continue
}
let left_child_proportion = cumulative_count / total_count;
let left_child_positive_probability = cumulative_y / cumulative_count;
let right_child_positive_probability = (total_y - cumulative_y)
/ (total_count - cumulative_count);
let impurity = DecisionTree::proxy_gini_impurity(left_child_proportion,
left_child_positive_probability,
right_child_positive_probability);
if impurity <= split_impurity {
split_impurity = impurity;
split_x = x;
}
}
(split_x, split_impurity)
}
fn proxy_gini_impurity(left_child_proportion: f32,
left_child_probability: f32,
right_child_probability: f32) -> f32 {
let right_child_proportion = 1.0 - left_child_proportion;
let left_impurity = 1.0 - left_child_probability.powi(2)
- (1.0 - left_child_probability).powi(2);
let right_impurity = 1.0 - right_child_probability.powi(2)
- (1.0 - right_child_probability).powi(2);
left_child_proportion * left_impurity
+ right_child_proportion * right_impurity
}
#[allow(unused_variables)]
fn get_values(X: &Array,
y: &Array,
num_positives: usize,
feature_idx: usize,
indices: & [usize],
values: &mut FeatureValues) {
values.clear();
for &row_idx in indices.iter() {
values.push(X.get(row_idx, feature_idx),
y.get(row_idx, 0));
}
values.sort();
}
fn get_values_sparse(x: &SparseColumnArray,
y: &Array,
num_positives: usize,
feature_idx: usize,
indices: &mut [usize],
values: &mut FeatureValues) {
let x = x.view_column(feature_idx);
values.clear();
let x_nnz = x.nnz() as f32;
match x_nnz * (indices.len() as f32).log(2.0) < x_nnz + indices.len() as f32 {
true => { DecisionTree::get_values_sparse_by_search(x, y, num_positives,
indices, values);},
false => { DecisionTree::get_values_sparse_by_iteration(x, y, num_positives,
indices, values); },
}
values.sort();
}
fn get_values_sparse_by_iteration(x: SparseArrayView,
y: &Array,
num_positives: usize,
indices: &mut [usize],
values: &mut FeatureValues) {
let mut indices_iter = indices.iter();
let mut nonzero_iter = x.iter_nonzero();
let mut indices_option = indices_iter.next();
let mut nonzero_option = nonzero_iter.next();
loop {
if let Some(&indices_idx) = indices_option {
if let Some((nonzero_idx, nonzero_value)) = nonzero_option {
match indices_idx.cmp(&nonzero_idx) {
Ordering::Less => {
values.push(0.0, y.get(indices_idx, 0));
indices_option = indices_iter.next();
},
Ordering::Equal => {
values.push(nonzero_value, y.get(nonzero_idx, 0));
indices_option = indices_iter.next();
nonzero_option = nonzero_iter.next();
},
Ordering::Greater => {
nonzero_option = nonzero_iter.next();
},
}
} else {
let remaining_zeros = indices.len() - values.count;
let remaining_positives = num_positives as f32 - values.total_y;
values.fill_remaining_zeros(remaining_zeros,
remaining_positives);
break
}
} else {
break
}
}
}
fn get_values_sparse_by_search(x: SparseArrayView,
y: &Array,
num_positives: usize,
indices: &mut [usize],
values: &mut FeatureValues) {
let y = y.data();
for (row_idx, value) in x.iter_nonzero() {
if let Ok(_) = indices.binary_search(&row_idx) {
values.push(value, y[row_idx]);
}
}
let remaining_zeros = indices.len() - values.count;
let remaining_positives = num_positives as f32 - values.total_y;
values.fill_remaining_zeros(remaining_zeros,
remaining_positives);
}
fn query_tree(&self, node: &Node, x: &Array, row_idx: usize) -> f32 {
match node {
&Node::Interior {feature,
threshold,
ref children} => {
match x.get(row_idx, feature) <= threshold {
true => self.query_tree(&children.0, x, row_idx),
false => self.query_tree(&children.1, x, row_idx),
}
},
&Node::Leaf {probability} => probability,
}
}
fn query_tree_sparse(&self, node: &Node, x: &SparseColumnArray, row_idx: usize) -> f32 {
match node {
&Node::Interior {feature,
threshold,
ref children} => {
match x.get(row_idx, feature) <= threshold {
true => self.query_tree_sparse(&children.0, x, row_idx),
false => self.query_tree_sparse(&children.1, x, row_idx),
}
},
&Node::Leaf {probability} => probability,
}
}
}
#[cfg(test)]
mod tests {
use prelude::*;
use cross_validation::cross_validation::CrossValidation;
use datasets::iris::load_data;
use metrics::accuracy_score;
use multiclass::OneVsRestWrapper;
use super::*;
use super::FeatureValues;
use rand::{StdRng, SeedableRng};
use bincode;
#[cfg(feature = "all_tests")]
use csv;
extern crate time;
#[test]
fn test_gini_impurity() {
let impurity = DecisionTree::proxy_gini_impurity(0.5,
0.5,
0.5);
let expected = 0.5;
assert!(impurity == expected);
let impurity = DecisionTree::proxy_gini_impurity(0.5,
1.0,
0.0);
let expected = 0.0;
assert!(impurity == expected);
let impurity = DecisionTree::proxy_gini_impurity(0.2,
1.0,
0.5);
let expected = 0.8 * 0.5;
assert!(impurity == expected);
}
#[test]
fn calculate_split_1() {
let x = SparseColumnArray
::from(&Array::from(&vec![vec![-1.0],
vec![-0.5],
vec![0.0],
vec![0.0],
vec![0.0],
vec![0.5],
vec![1.0]]));
let y = Array::from(vec![1.0, 1.0, 0.0,
0.0, 0.0, 0.0, 0.0]);
let mut values = FeatureValues::with_capacity(y.rows());
let mut indices = (0..y.rows()).collect::<Vec<_>>();
DecisionTree::get_values_sparse(&x,
&y,
2,
0,
&mut indices[..],
&mut values);
let (threshold, impurity) = DecisionTree::calculate_split(&values);
assert!(threshold == -0.5);
assert!(impurity == 0.0);
}
#[test]
fn calculate_split_2() {
let x = SparseColumnArray
::from(&Array::from(&vec![vec![-1.0],
vec![-0.5],
vec![0.0],
vec![0.0],
vec![0.0],
vec![0.5],
vec![1.0]]));
let y = Array::from(vec![1.0, 1.0, 1.0,
1.0, 1.0, 0.0, 0.0]);
let mut values = FeatureValues::with_capacity(y.rows());
let mut indices = (0..y.rows()).collect::<Vec<_>>();
DecisionTree::get_values_sparse(&x,
&y,
5,
0,
&mut indices[..],
&mut values);
let (threshold, impurity) = DecisionTree::calculate_split(&values);
assert!(threshold == 0.0);
assert!(impurity == 0.0);
}
#[test]
fn test_indices_split() {
let x = Array::from(&vec![vec![-1.0, 1.0],
vec![-0.5, 1.0],
vec![0.5, 0.0],
vec![1.0, 0.0]]);
let mut indices = vec![0, 1, 2, 3];
let (left, right) = DecisionTree::split_indices(&x, &mut indices[..],
0, -0.5);
assert!(left.to_owned() == vec![0, 1]);
assert!(right.to_owned() == vec![2, 3]);
}
#[test]
fn test_indices_split_sparse() {
let x = SparseColumnArray
::from(&Array::from(&vec![vec![-1.0, 1.0],
vec![0.0, 2.0],
vec![0.0, 3.0],
vec![1.0, 0.0],
vec![0.0, 0.0]]));
let mut indices = (0..5).collect::<Vec<_>>();
let ind_bor = &mut indices[..];
let (left, right) = DecisionTree::split_indices_sparse(&x,
ind_bor,
0,
0.0);
assert!(left.to_owned() == vec![0, 1, 2, 4]);
assert!(right.to_owned() == vec![3]);
let mut indices = (0..5).collect::<Vec<_>>();
let ind_bor = &mut indices[..];
let (left, right) = DecisionTree::split_indices_sparse(&x,
ind_bor,
1,
0.5);
assert!(left.to_owned() == vec![3, 4]);
assert!(right.to_owned() == vec![0, 1, 2]);
}
#[test]
fn test_get_values_sparse() {
let x = SparseColumnArray
::from(&Array::from(&vec![vec![-1.0, 1.0],
vec![0.0, 2.0],
vec![0.0, 3.0],
vec![1.0, 0.0],
vec![0.0, 0.0]]));
let y = Array::from(vec![0.0, 1.0, 0.0, 1.0, 1.0]);
let mut values = FeatureValues::with_capacity(5);
let mut indices = (0..5).collect::<Vec<_>>();
DecisionTree::get_values_sparse(&x,
&y,
3,
0,
&mut indices[..],
&mut values);
assert!(values.xy_pairs == vec![(-1.0, 0.0), (0.0, 0.0), (1.0, 1.0)]);
assert!(values.zero_count == 3);
assert!(values.zero_y == 2.0);
assert!(values.total_y == 3.0);
values.clear();
DecisionTree::get_values_sparse(&x,
&y,
3,
1,
&mut indices[..],
&mut values);
assert!(values.xy_pairs == vec![(0.0, 0.0), (1.0, 0.0), (2.0, 1.0), (3.0, 0.0)]);
assert!(values.zero_count == 2);
assert!(values.zero_y == 2.0);
assert!(values.total_y == 3.0);
}
#[test]
fn test_basic_tree_building() {
let X = Array::from(&vec![vec![0.0, 1.0],
vec![1.0, 0.0]]);
let y = Array::from(vec![1.0, 0.0]);
let params = Hyperparameters::new(2);
let mut model = params.build();
model.fit(&X, &y).unwrap();
assert!(allclose(&y, &model.decision_function(&X).unwrap()));
}
#[test]
fn test_decision_tree_iris() {
let (data, target) = load_data();
let mut test_accuracy = 0.0;
let no_splits = 10;
let mut cv = CrossValidation::new(data.rows(),
no_splits);
cv.set_rng(StdRng::from_seed(&[100]));
for (train_idx, test_idx) in cv {
let x_train = data.get_rows(&train_idx);
let x_test = data.get_rows(&test_idx);
let y_train = target.get_rows(&train_idx);
let mut model = Hyperparameters::new(data.cols())
.min_samples_split(5)
.max_features(4)
.rng(StdRng::from_seed(&[100]))
.one_vs_rest();
model.fit(&x_train, &y_train).unwrap();
let test_prediction = model.predict(&x_test).unwrap();
test_accuracy += accuracy_score(
&target.get_rows(&test_idx),
&test_prediction);
}
test_accuracy /= no_splits as f32;
println!("Accuracy {}", test_accuracy);
assert!(test_accuracy > 0.96);
}
#[test]
fn test_decision_tree_iris_sparse() {
let (data, target) = load_data();
let mut test_accuracy = 0.0;
let no_splits = 10;
let mut cv = CrossValidation::new(data.rows(),
no_splits);
cv.set_rng(StdRng::from_seed(&[100]));
for (train_idx, test_idx) in cv {
let x_train = SparseColumnArray::from(&data.get_rows(&train_idx));
let x_test = SparseColumnArray::from(&data.get_rows(&test_idx));
let y_train = target.get_rows(&train_idx);
let mut model = Hyperparameters::new(data.cols())
.min_samples_split(5)
.max_features(4)
.rng(StdRng::from_seed(&[100]))
.one_vs_rest();
model.fit(&x_train, &y_train).unwrap();
let test_prediction = model.predict(&x_test).unwrap();
test_accuracy += accuracy_score(
&target.get_rows(&test_idx),
&test_prediction);
}
test_accuracy /= no_splits as f32;
println!("Accuracy {}", test_accuracy);
assert!(test_accuracy > 0.96);
}
#[test]
fn serialization() {
let (data, target) = load_data();
let mut test_accuracy = 0.0;
let no_splits = 10;
let mut cv = CrossValidation::new(data.rows(),
no_splits);
cv.set_rng(StdRng::from_seed(&[100]));
for (train_idx, test_idx) in cv {
let x_train = data.get_rows(&train_idx);
let x_test = data.get_rows(&test_idx);
let y_train = target.get_rows(&train_idx);
let mut model = Hyperparameters::new(data.cols())
.min_samples_split(5)
.max_features(4)
.rng(StdRng::from_seed(&[100]))
.one_vs_rest();
model.fit(&x_train, &y_train).unwrap();
let encoded = bincode::rustc_serialize::encode(&model,
bincode::SizeLimit::Infinite).unwrap();
let decoded: OneVsRestWrapper<DecisionTree> = bincode::rustc_serialize::decode(&encoded).unwrap();
let test_prediction = decoded.predict(&x_test).unwrap();
test_accuracy += accuracy_score(
&target.get_rows(&test_idx),
&test_prediction);
}
test_accuracy /= no_splits as f32;
println!("Accuracy {}", test_accuracy);
assert!(test_accuracy > 0.96);
}
#[test]
#[cfg(feature = "all_tests")]
fn test_decision_tree_newsgroups() {
use feature_extraction::dict_vectorizer::*;
let mut rdr = csv::Reader::from_file("./test_data/newsgroups/data.csv")
.unwrap()
.has_headers(false);
let mut vectorizer = DictVectorizer::new();
let mut target = Vec::new();
for (row, record) in rdr.decode().enumerate() {
let (y, data): (f32, String) = record.unwrap();
for token in data.split_whitespace() {
vectorizer.partial_fit(row, token, 1.0);
}
target.push(y);
}
let target = Array::from(target);
let X = vectorizer.transform();
let no_splits = 2;
let mut test_accuracy = 0.0;
let mut train_accuracy = 0.0;
let mut cv = CrossValidation::new(X.rows(),
no_splits);
cv.set_rng(StdRng::from_seed(&[100]));
for (train_idx, test_idx) in cv {
let x_train = SparseColumnArray::from(&X.get_rows(&train_idx));
let x_test = SparseColumnArray::from(&X.get_rows(&test_idx));
let y_train = target.get_rows(&train_idx);
let mut model = Hyperparameters::new(X.cols())
.min_samples_split(5)
.rng(StdRng::from_seed(&[100]))
.one_vs_rest();
let start = time::precise_time_ns();
model.fit(&x_train, &y_train).unwrap();
println!("Elapsed {}", time::precise_time_ns() - start);
let y_hat = model.predict(&x_test).unwrap();
let y_hat_train = model.predict(&x_train).unwrap();
test_accuracy += accuracy_score(
&target.get_rows(&test_idx),
&y_hat);
train_accuracy += accuracy_score(
&target.get_rows(&train_idx),
&y_hat_train);
}
test_accuracy /= no_splits as f32;
train_accuracy /= no_splits as f32;
println!("{}", test_accuracy);
println!("train accuracy {}", train_accuracy);
assert!(train_accuracy > 0.95);
}
}
#[cfg(feature = "bench")]
#[allow(unused_imports)]
mod bench {
use prelude::*;
use datasets::iris::load_data;
use super::Hyperparameters;
use rand::{Rng, StdRng, SeedableRng};
use test::Bencher;
#[bench]
fn bench_wide(b: &mut Bencher) {
let rows = 100;
let cols = 5000;
let mut rng = StdRng::new().unwrap();
let mut X = Array::from((0..(rows * cols))
.map(|_| rng.next_f32())
.collect::<Vec<_>>());
X.reshape(rows, cols);
let y = Array::from((0..rows)
.map(|_| *rng.choose(&vec![0.0, 1.0][..]).unwrap())
.collect::<Vec<_>>());
let mut model = Hyperparameters::new(cols)
.min_samples_split(5)
.rng(StdRng::from_seed(&[100]))
.build();
b.iter(|| {
model.fit(&X, &y).unwrap();
});
}
#[bench]
fn bench_tall(b: &mut Bencher) {
let rows = 5000;
let cols = 10;
let mut rng = StdRng::new().unwrap();
let mut X = Array::from((0..(rows * cols))
.map(|_| rng.next_f32())
.collect::<Vec<_>>());
X.reshape(rows, cols);
let y = Array::from((0..rows)
.map(|_| *rng.choose(&vec![0.0, 1.0][..]).unwrap())
.collect::<Vec<_>>());
let mut model = Hyperparameters::new(cols)
.min_samples_split(5)
.rng(StdRng::from_seed(&[100]))
.build();
b.iter(|| {
model.fit(&X, &y).unwrap();
});
}
#[bench]
fn bench_wide_sparse(b: &mut Bencher) {
let rows = 100;
let cols = 5000;
let mut rng = StdRng::new().unwrap();
let mut X = Array::from((0..(rows * cols))
.map(|_| {
match rng.gen_weighted_bool(4) {
true => rng.next_f32(),
false => 0.0,
}
})
.collect::<Vec<_>>());
X.reshape(rows, cols);
let X = SparseColumnArray::from(&X);
let y = Array::from((0..rows)
.map(|_| *rng.choose(&vec![0.0, 1.0][..]).unwrap())
.collect::<Vec<_>>());
let mut model = Hyperparameters::new(cols)
.min_samples_split(5)
.rng(StdRng::from_seed(&[100]))
.build();
b.iter(|| {
model.fit(&X, &y).unwrap();
});
}
#[bench]
fn bench_tall_sparse(b: &mut Bencher) {
let rows = 5000;
let cols = 100;
let mut rng = StdRng::new().unwrap();
let mut X = Array::from((0..(rows * cols))
.map(|_| {
match rng.gen_weighted_bool(4) {
true => rng.next_f32(),
false => 0.0,
}
})
.collect::<Vec<_>>());
X.reshape(rows, cols);
let X = SparseColumnArray::from(&X);
let y = Array::from((0..rows)
.map(|_| *rng.choose(&vec![0.0, 1.0][..]).unwrap())
.collect::<Vec<_>>());
let mut model = Hyperparameters::new(cols)
.min_samples_split(5)
.rng(StdRng::from_seed(&[100]))
.build();
b.iter(|| {
model.fit(&X, &y).unwrap();
});
}
#[bench]
fn bench_decision_tree_iris(b: &mut Bencher) {
let (data, target) = load_data();
let mut model = Hyperparameters::new(data.cols())
.min_samples_split(5)
.max_features(4)
.rng(StdRng::from_seed(&[100]))
.one_vs_rest();
b.iter(|| {
model.fit(&data, &target).unwrap();
});
}
#[bench]
fn bench_decision_tree_newsgroups(b: &mut Bencher) {
extern crate csv;
use feature_extraction::dict_vectorizer::*;
let mut rdr = csv::Reader::from_file("./test_data/newsgroups/data.csv")
.unwrap()
.has_headers(false);
let mut vectorizer = DictVectorizer::new();
let mut target = Vec::new();
for (row, record) in rdr.decode().enumerate() {
let (y, data): (f32, String) = record.unwrap();
for token in data.split_whitespace() {
vectorizer.partial_fit(row, token, 1.0);
}
target.push(y);
}
let target = Array::from(target).get_rows(&(..500));
let X = vectorizer.transform();
let x_train = SparseColumnArray::from(&X.get_rows(&(..500)));
let mut model = Hyperparameters::new(X.cols())
.min_samples_split(5)
.rng(StdRng::from_seed(&[100]))
.one_vs_rest();
b.iter(|| {
model.fit(&x_train, &target).unwrap();
});
}
}