use crate::data::{FloatData, JaggedMatrix, Matrix};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
pub struct SortBuffer {
pub grad: Vec<f32>,
pub hess: Vec<f32>,
}
impl SortBuffer {
pub fn new() -> Self {
SortBuffer {
grad: Vec::new(),
hess: Vec::new(),
}
}
fn fill_sorted(&mut self, grad: &[f32], hess: &[f32], index: &[usize]) {
self.grad.clear();
self.hess.clear();
self.grad.reserve(index.len().saturating_sub(self.grad.capacity()));
self.hess.reserve(index.len().saturating_sub(self.hess.capacity()));
for &i in index {
self.grad.push(grad[i]);
self.hess.push(hess[i]);
}
}
}
impl Default for SortBuffer {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct Bin<T> {
pub gradient_sum: T,
pub hessian_sum: T,
pub cut_value: f64,
}
impl Bin<f32> {
pub fn new_f32(cut_value: f64) -> Self {
Bin {
gradient_sum: f32::ZERO,
hessian_sum: f32::ZERO,
cut_value,
}
}
pub fn from_parent_child(root_bin: &Bin<f32>, child_bin: &Bin<f32>) -> Self {
Bin {
gradient_sum: root_bin.gradient_sum - child_bin.gradient_sum,
hessian_sum: root_bin.hessian_sum - child_bin.hessian_sum,
cut_value: root_bin.cut_value,
}
}
pub fn from_parent_two_children(
root_bin: &Bin<f32>,
first_child_bin: &Bin<f32>,
second_child_bin: &Bin<f32>,
) -> Self {
Bin {
gradient_sum: root_bin.gradient_sum
- (first_child_bin.gradient_sum + second_child_bin.gradient_sum),
hessian_sum: root_bin.hessian_sum
- (first_child_bin.hessian_sum + second_child_bin.hessian_sum),
cut_value: root_bin.cut_value,
}
}
}
impl Bin<f64> {
pub fn new_f64(cut_value: f64) -> Self {
Bin {
gradient_sum: f64::ZERO,
hessian_sum: f64::ZERO,
cut_value,
}
}
pub fn as_f32_bin(&self) -> Bin<f32> {
Bin {
gradient_sum: self.gradient_sum as f32,
hessian_sum: self.hessian_sum as f32,
cut_value: self.cut_value,
}
}
}
#[derive(Debug, Deserialize, Serialize)]
pub struct HistogramMatrix(pub JaggedMatrix<Bin<f32>>);
pub fn create_feature_histogram(
feature: &[u16],
cuts: &[f64],
sorted_grad: &[f32],
sorted_hess: &[f32],
index: &[usize],
) -> Vec<Bin<f32>> {
let mut histogram: Vec<Bin<f64>> = Vec::with_capacity(cuts.len());
histogram.push(Bin::new_f64(f64::NAN));
histogram.extend(cuts[..(cuts.len() - 1)].iter().map(|c| Bin::new_f64(*c)));
index
.iter()
.zip(sorted_grad)
.zip(sorted_hess)
.for_each(|((i, g), h)| {
let bin = feature[*i] as usize;
histogram[bin].gradient_sum += f64::from(*g);
histogram[bin].hessian_sum += f64::from(*h);
});
histogram.iter().map(|b| b.as_f32_bin()).collect()
}
impl HistogramMatrix {
pub fn empty() -> Self {
HistogramMatrix(JaggedMatrix {
data: Vec::new(),
ends: Vec::new(),
cols: 0,
n_records: 0,
})
}
#[allow(clippy::too_many_arguments)]
pub fn new(
data: &Matrix<u16>,
cuts: &JaggedMatrix<f64>,
grad: &[f32],
hess: &[f32],
index: &[usize],
col_index: &[usize],
parallel: bool,
sort: bool,
sort_buffer: &mut SortBuffer,
) -> Self {
let (sorted_grad, sorted_hess): (&[f32], &[f32]) = if !sort {
(grad, hess)
} else {
sort_buffer.fill_sorted(grad, hess, index);
(&sort_buffer.grad, &sort_buffer.hess)
};
let histograms = if parallel {
col_index
.par_iter()
.flat_map(|col| {
create_feature_histogram(
data.get_col(*col),
cuts.get_col(*col),
sorted_grad,
sorted_hess,
index,
)
})
.collect::<Vec<Bin<f32>>>()
} else {
col_index
.iter()
.flat_map(|col| {
create_feature_histogram(
data.get_col(*col),
cuts.get_col(*col),
sorted_grad,
sorted_hess,
index,
)
})
.collect::<Vec<Bin<f32>>>()
};
let ends: Vec<usize> = if col_index.len() == data.cols {
cuts.ends.to_owned()
} else {
col_index
.iter()
.scan(0_usize, |state, i| {
*state += cuts.get_col(*i).len();
Some(*state)
})
.collect()
};
let n_records = if col_index.len() == data.cols {
cuts.n_records
} else {
ends.iter().sum()
};
HistogramMatrix(JaggedMatrix {
data: histograms,
ends,
cols: col_index.len(),
n_records,
})
}
pub fn from_parent_child(
root_histogram: &HistogramMatrix,
child_histogram: &HistogramMatrix,
) -> Self {
let HistogramMatrix(root) = root_histogram;
let HistogramMatrix(child) = child_histogram;
let histograms = root
.data
.iter()
.zip(child.data.iter())
.map(|(root_bin, child_bin)| Bin::from_parent_child(root_bin, child_bin))
.collect();
HistogramMatrix(JaggedMatrix {
data: histograms,
ends: child.ends.to_owned(),
cols: child.cols,
n_records: child.n_records,
})
}
pub fn from_parent_two_children(
root_histogram: &HistogramMatrix,
first_child_histogram: &HistogramMatrix,
second_child_histogram: &HistogramMatrix,
) -> Self {
let HistogramMatrix(root) = root_histogram;
let HistogramMatrix(first_child) = first_child_histogram;
let HistogramMatrix(second_child) = second_child_histogram;
let histograms = root
.data
.iter()
.zip(first_child.data.iter())
.zip(second_child.data.iter())
.map(|((root_bin, first_child_bin), second_child_bin)| {
Bin::from_parent_two_children(root_bin, first_child_bin, second_child_bin)
})
.collect();
HistogramMatrix(JaggedMatrix {
data: histograms,
ends: first_child.ends.to_owned(),
cols: first_child.cols,
n_records: first_child.n_records,
})
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::binning::bin_matrix;
use crate::objective::{LogLoss, ObjectiveFunction};
use std::fs;
#[test]
fn test_single_histogram() {
let file = fs::read_to_string("resources/contiguous_no_missing.csv")
.expect("Something went wrong reading the file");
let data_vec: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect();
let data = Matrix::new(&data_vec, 891, 5);
let sample_weight = vec![1.; data.rows];
let b = bin_matrix(&data, &sample_weight, 10, f64::NAN).unwrap();
let bdata = Matrix::new(&b.binned_data, data.rows, data.cols);
let y: Vec<f64> = file.lines().map(|x| x.parse::<f64>().unwrap()).collect();
let yhat = vec![0.5; y.len()];
let w = vec![1.; y.len()];
let (g, h) = LogLoss::calc_grad_hess(&y, &yhat, &w);
let hist =
create_feature_histogram(&bdata.get_col(1), &b.cuts.get_col(1), &g, &h, &bdata.index);
let mut f = bdata.get_col(1).to_owned();
println!("{:?}", hist);
f.sort();
f.dedup();
assert_eq!(f.len() + 1, hist.len());
}
}