use rayon::prelude::*;
use serde::{
Serialize,
Deserialize
};
use crate::{Sample, Feature};
use crate::weak_learner::common::type_and_struct::*;
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub enum LossType {
L1,
L2,
}
impl LossType {
pub(super) fn best_split<'a>(
&self,
sample: &'a Sample,
dist: &[f64],
idx: &[usize],
) -> (&'a str, Threshold)
{
let target = sample.target();
let target = &target[..];
sample.features()
.into_par_iter()
.map(|feature| {
let (loss_value, threshold) = self.best_split_at(
feature, target, dist, &idx[..]
);
(loss_value, feature.name(), threshold)
})
.min_by(|x, y| x.0.partial_cmp(&y.0).unwrap())
.map(|(_, name, threshold)| (name, threshold))
.expect(
"\
No feature that minimizes \
the weighted loss.\
"
)
}
fn best_split_at(
&self,
column: &Feature,
target: &[f64],
dist: &[f64],
idx: &[usize],
) -> (LossValue, Threshold)
{
match self {
LossType::L1 => best_split_l1(column, target, dist, idx),
LossType::L2 => best_split_l2(column, target, dist, idx),
}
}
pub(super) fn prediction_and_loss(
&self,
target: &[f64],
indices: &[usize],
dist: &[f64],
) -> (Prediction<f64>, LossValue)
{
let tuples = indices.into_par_iter()
.copied()
.map(|i| {
let y = target[i];
(dist[i], y)
})
.collect::<Vec<(f64, f64)>>();
let sum_dist = tuples.iter()
.map(|(d, _)| d)
.sum::<f64>();
if sum_dist == 0.0 {
return (Prediction::from(0.0), LossValue::from(0.0));
}
let prediction;
let loss_value;
match self {
LossType::L1 => {
let triplets = tuples.into_iter()
.map(|(d, y)| (0.0.into(), d.into(), y.into()))
.collect::<Vec<(FeatureValue, Mass, Target)>>();
(prediction, loss_value) = inner_prediction_and_loss(
&triplets[..],
);
},
LossType::L2 => {
prediction = Prediction::from(
tuples.iter()
.map(|(d, y)| *d * *y)
.sum::<f64>()
/ sum_dist
);
loss_value = LossValue::from(
tuples.into_iter()
.map(|(d, y)| d * (y - prediction.0).powi(2))
.sum::<f64>()
/ sum_dist
);
}
}
(prediction, loss_value)
}
}
fn best_split_l1(
column: &Feature,
target: &[f64],
dist: &[f64],
idx: &[usize]
) -> (LossValue, Threshold)
{
let mut triplets = idx.into_par_iter()
.copied()
.map(|i| {
let x = column[i];
let y = target[i];
(FeatureValue::from(x), Mass::from(dist[i]), Target::from(y))
})
.collect::<Vec<_>>();
triplets.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
let mut node = BestSplitFinderL1::new(triplets);
let mut best_loss_value;
let mut best_threshold;
(best_loss_value, best_threshold) = node.lad_and_threshold();
while let Some((loss_value, threshold)) = node.move_to_left() {
if loss_value < best_loss_value {
best_loss_value = loss_value;
best_threshold = threshold;
}
}
let loss = LossValue::from(best_loss_value);
let threshold = Threshold::from(best_threshold);
(loss, threshold)
}
fn best_split_l2(
column: &Feature,
target: &[f64],
dist: &[f64],
idx: &[usize]
) -> (LossValue, Threshold)
{
let mut triplets = idx.into_par_iter()
.copied()
.map(|i| {
let x = column[i];
let y = target[i];
(x, dist[i], y)
})
.collect::<Vec<(f64, f64, f64)>>();
triplets.sort_by(|(x1, _, _), (x2, _, _)| x1.partial_cmp(x2).unwrap());
let mut node = BestSplitFinderL2::new(triplets);
let mut best_variance;
let mut best_threshold;
(best_variance, best_threshold) = node.variance_and_threshold();
while let Some((variance, threshold)) = node.move_to_left() {
if variance < best_variance {
best_variance = variance;
best_threshold = threshold;
}
}
let loss = LossValue::from(best_variance);
let threshold = Threshold::from(best_threshold);
(loss, threshold)
}
struct BestSplitFinderL2 {
left: Vec<(f64, f64, f64)>,
right: Vec<(f64, f64, f64)>,
left_sum_target: f64,
right_sum_target: f64,
left_sum_dist: f64,
right_sum_dist: f64,
}
impl BestSplitFinderL2 {
pub(self) fn new(triplets: Vec<(f64, f64, f64)>) -> Self {
let m = triplets.len();
let left = Vec::with_capacity(m);
let right = triplets;
let left_sum_target: f64 = 0.0;
let mut right_sum_target: f64 = 0.0;
let left_sum_dist: f64 = 0.0;
let mut right_sum_dist: f64 = 0.0;
right.iter()
.rev()
.for_each(|(_, d, target)| {
right_sum_target += d * target;
right_sum_dist += d;
});
Self {
left,
right,
left_sum_target,
right_sum_target,
left_sum_dist,
right_sum_dist,
}
}
pub(self) fn variance_and_threshold(&self) -> (f64, f64) {
let threshold;
if self.right.is_empty() {
let left_largest = self.left.last()
.map(|(x, _, _)| x).unwrap();
threshold = left_largest + 1.0;
} else {
let right_smallest = self.right.last()
.map(|(x, _, _)| x)
.unwrap();
let left_largest = self.left.last()
.map(|(x, _, _)| *x)
.unwrap_or(right_smallest - 2.0);
threshold = (left_largest + right_smallest) / 2.0;
}
let left_variance = calc_variance(
&self.left[..], self.left_sum_target, self.left_sum_dist,
);
let right_variance = calc_variance(
&self.right[..], self.right_sum_target, self.right_sum_dist,
);
let left_prob = self.left_sum_dist
/ (self.left_sum_dist + self.right_sum_dist);
let right_prob = 1.0_f64 - left_prob;
let variance = left_prob * left_variance + right_prob * right_variance;
(variance, threshold)
}
pub(self) fn move_to_left(&mut self) -> Option<(f64, f64)> {
if let Some(r) = self.right.pop() {
self.right_sum_dist -= r.1;
self.right_sum_target -= r.1 * r.2;
self.left_sum_dist += r.1;
self.left_sum_target += r.1 * r.2;
self.left.push(r);
while let Some(r2) = self.right.pop() {
if r.0 == r2.0 {
self.right_sum_dist -= r2.1;
self.right_sum_target -= r2.1 * r2.2;
self.left_sum_dist += r2.1;
self.left_sum_target += r2.1 * r2.2;
self.left.push(r2);
} else {
self.right.push(r2);
break;
}
}
let variance_threshold = self.variance_and_threshold();
Some(variance_threshold)
} else {
None
}
}
}
fn calc_variance(triplets: &[(f64, f64, f64)], y_sum: f64, sum_dist: f64)
-> f64
{
if sum_dist == 0.0 || triplets.is_empty() {
return 0.0;
}
let y_mean = y_sum / sum_dist;
triplets.into_par_iter()
.map(|(_, d, y)| d * (y - y_mean).powi(2))
.sum::<f64>()
/ sum_dist
}
struct BestSplitFinderL1 {
left: Vec<(FeatureValue, Mass, Target)>,
right: Vec<(FeatureValue, Mass, Target)>,
}
impl BestSplitFinderL1 {
pub(self) fn new(triplets: Vec<(FeatureValue, Mass, Target)>)
-> Self
{
let m = triplets.len();
let left = Vec::with_capacity(m);
let right = triplets;
Self {
left,
right,
}
}
pub(self) fn lad_and_threshold(&self)
-> (LossValue, Threshold)
{
let threshold;
if self.right.is_empty() {
let left_largest = self.left.last()
.map(|(x, _, _)| x.0)
.unwrap();
threshold = left_largest + 1.0;
} else {
let right_smallest = self.right.last()
.map(|(x, _, _)| x.0)
.unwrap();
let left_largest = self.left.last()
.map(|(x, _, _)| x.0)
.unwrap_or(right_smallest - 2.0);
threshold = (left_largest + right_smallest) / 2.0;
}
let threshold = Threshold::from(threshold);
let left_lad = inner_prediction_and_loss(&self.left[..]).1;
let right_lad = inner_prediction_and_loss(&self.right[..]).1;
let lad = left_lad + right_lad;
(lad, threshold)
}
pub(self) fn move_to_left(&mut self)
-> Option<(LossValue, Threshold)>
{
if let Some(r) = self.right.pop() {
self.left.push(r);
let feature = r.0;
while let Some(r2) = self.right.pop() {
if feature == r2.0 {
self.left.push(r2);
} else {
self.right.push(r2);
break;
}
}
let lad_threshold = self.lad_and_threshold();
Some(lad_threshold)
} else {
None
}
}
}
fn inner_prediction_and_loss(triplets: &[(FeatureValue, Mass, Target)])
-> (Prediction<f64>, LossValue)
{
let m_total = triplets.iter()
.map(|(_, m, _)| m.0)
.sum::<f64>();
if m_total == 0.0 {
let prediction = 0.0.into();
let loss = 0.0.into();
return (prediction, loss);
}
let mut triplets = triplets.to_vec();
triplets.sort_by(|(_, _, t1), (_, _, t2)|
t1.partial_cmp(&t2).unwrap()
);
let mut p_small = 1e9;
let mut p_large = 1e9;
let mut m_partial_sum = 0.0;
for (_, m, t) in triplets.iter() {
m_partial_sum += m.0;
if m_partial_sum >= 0.5 * m_total {
p_large = t.0;
if p_small == 1e9 {
p_small = t.0;
}
break;
}
p_small = t.0;
}
assert!(p_small < 1e9);
assert!(p_large < 1e9);
let loss_p_small = triplets.par_iter()
.map(|(_, m, t)| m.0 * (t.0 - p_small).abs())
.sum::<f64>();
let loss_p_large = triplets.par_iter()
.map(|(_, m, t)| m.0 * (t.0 - p_large).abs())
.sum::<f64>();
let prediction;
let loss;
if loss_p_small < loss_p_large {
prediction = p_small.into();
loss = loss_p_small.into();
} else {
prediction = p_large.into();
loss = loss_p_large.into();
}
(prediction, loss)
}