//! Utilities for binning the columns of a numeric matrix into discrete buckets.
use crate::data::{FloatData, JaggedMatrix, Matrix};
use crate::errors::ForustError;
use crate::utils::{is_missing, map_bin, percentiles};
/// Compute percentile cut points for a vector, unless there are
/// no more unique values than there are requested percentiles,
/// in which case the unique values themselves are returned.
///
/// * `v` - A numeric slice to calculate percentiles for.
/// * `sample_weight` - Instance weights for each row in the data.
/// * `pcts` - The percentiles to compute.
fn percentiles_or_value<T>(v: &[T], sample_weight: &[T], pcts: &[T]) -> Vec<T>
where
    T: FloatData<T>,
{
    let mut unique = v.to_owned();
    unique.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
    unique.dedup();
    // With this few distinct values, percentile interpolation would
    // only reproduce (a subset of) them anyway, so just return them.
    if unique.len() <= pcts.len() + 1 {
        unique
    } else {
        percentiles(v, sample_weight, pcts)
    }
}
// We want to be able to bin our dataset into discrete buckets.
// First we will calculate percentiles and the number of unique values
// for each feature.
// Then we will bucket them into bins from 0 to N + 1 where N is the number
// of unique bin values created from the percentiles, and the very last
// bin is missing values.
// For now, we will just use usize, although, it would be good to see if
// we can use something smaller, u8 for instance.
// If we generated these cuts:
// [0.0, 7.8958, 14.4542, 31.0, 512.3292, inf]
// We would have a number with bins 0 (missing), 1 [MIN, 0.0), 2 (0.0, 7], 3 [], 4, 5
// a split that is [feature < 5] would translate to [feature < 31.0 ]
/// The output of binning a numeric matrix: the binned values together
/// with the cut points that produced them.
pub struct BinnedData<T> {
// Flat buffer of bin indices, one per cell of the original matrix.
// Bin 0 is reserved for missing values (see the notes above).
pub binned_data: Vec<u16>,
// Per-column cut values; column `i` holds the cuts for feature `i`.
pub cuts: JaggedMatrix<T>,
// Number of cut values recorded for each column.
pub nunique: Vec<usize>,
}
/// Convert a matrix of data, into a binned matrix.
///
/// * `data` - Numeric data to be binned.
/// * `cuts` - A jagged matrix whose columns hold the cut values
///   for the corresponding columns of `data`.
fn bin_matrix_from_cuts<T: std::cmp::PartialOrd>(
    data: &Matrix<T>,
    cuts: &JaggedMatrix<T>,
) -> Vec<u16> {
    let mut binned = Vec::with_capacity(data.data.len());
    // The backing buffer is column-major, so dividing the flat index
    // by the number of rows recovers the column we are in.
    for (idx, value) in data.data.iter().enumerate() {
        let column = idx / data.rows;
        // The bin index is always smaller than u16::MAX, so it is
        // safe to just unwrap here.
        binned.push(map_bin(cuts.get_col(column), value).unwrap());
    }
    binned
}
/// Bin a numeric matrix.
///
/// * `data` - A numeric matrix, of data to be binned.
/// * `sample_weight` - Instance weights for each row of the data.
/// * `nbins` - The number of bins each column should be binned into.
/// * `missing` - Float value to consider as missing.
///
/// Returns `ForustError::NoVariance` for any column that ends up with
/// fewer than two cut values (i.e. holds a single distinct value).
pub fn bin_matrix(
    data: &Matrix<f64>,
    sample_weight: &[f64],
    nbins: u16,
    missing: f64,
) -> Result<BinnedData<f64>, ForustError> {
    // Evenly spaced percentiles in [0, 1): one per requested bin.
    let nbins_f = f64::from_u16(nbins);
    let pcts: Vec<f64> = (0..nbins).map(|b| f64::from_u16(b) / nbins_f).collect();

    // Generate the cuts column by column, appending each column's cut
    // values onto the jagged matrix.
    let mut cuts = JaggedMatrix::new();
    let mut nunique = Vec::new();
    for col in 0..data.cols {
        // Drop missing entries, keeping values and weights aligned.
        // It is unrecoverable if the data contains missing values
        // other than the specified `missing`.
        let (vals, wgts): (Vec<f64>, Vec<f64>) = data
            .get_col(col)
            .iter()
            .zip(sample_weight.iter())
            .filter(|(v, _)| !is_missing(v, &missing))
            .unzip();
        assert_eq!(vals.len(), wgts.len());
        let mut col_cuts = percentiles_or_value(&vals, &wgts, &pcts);
        // A MAX sentinel closes the last bin on the right; the cuts are
        // sorted, so dedup removes any duplicate adjacent values.
        col_cuts.push(f64::MAX);
        col_cuts.dedup();
        if col_cuts.len() < 2 {
            return Err(ForustError::NoVariance(col));
        }
        // There is one less bin than there are cuts; bin 0 is reserved
        // for missing values.
        nunique.push(col_cuts.len());
        let n_cuts = col_cuts.len();
        cuts.data.extend(col_cuts);
        // Ends are cumulative offsets into the flat cut buffer.
        let end = cuts.ends.last().map_or(n_cuts, |prev| prev + n_cuts);
        cuts.ends.push(end);
        cuts.cols = cuts.ends.len();
        // NOTE(review): this sums the *cumulative* offsets rather than
        // the total cut count (`cuts.data.len()`) — confirm this matches
        // the intended meaning of `JaggedMatrix::n_records`.
        cuts.n_records = cuts.ends.iter().sum();
    }
    let binned_data = bin_matrix_from_cuts(data, &cuts);
    Ok(BinnedData {
        binned_data,
        cuts,
        nunique,
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    #[test]
    fn test_bin_data() {
        // Fixture: 891 rows by 5 columns, no missing values.
        let file = fs::read_to_string("resources/contiguous_no_missing.csv")
            .expect("Something went wrong reading the file");
        let data_vec: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect();
        let data = Matrix::new(&data_vec, 891, 5);
        let sample_weight = vec![1.; data.rows];
        let b = bin_matrix(&data, &sample_weight, 50, f64::NAN).unwrap();
        let bdata = Matrix::new(&b.binned_data, data.rows, data.cols);
        // For every column, each half-open cut interval [lo, hi) must
        // contain exactly as many raw values as there are cells assigned
        // to the corresponding bin (bins start at 1; 0 marks missing).
        for column in 0..data.cols {
            for (win_idx, window) in b.cuts.get_col(column).windows(2).enumerate() {
                let (lo, hi) = (window[0], window[1]);
                let bin_id = (win_idx + 1) as u16;
                let in_bin = bdata
                    .get_col(column)
                    .iter()
                    .filter(|&&bin| bin == bin_id)
                    .count();
                let in_range = data
                    .get_col(column)
                    .iter()
                    .filter(|&&v| (lo <= v) && (v < hi))
                    .count();
                assert_eq!(in_range, in_bin);
            }
        }
    }
}