pub(super) fn mean_f32(values: &[f32]) -> f32 {
if values.is_empty() {
0.0
} else {
values.iter().sum::<f32>() / values.len() as f32
}
}
pub(super) fn variance_f32(y: &[f32]) -> f32 {
if y.len() <= 1 {
return 0.0;
}
let mean = mean_f32(y);
let sum_squared_diff: f32 = y.iter().map(|&val| (val - mean).powi(2)).sum();
sum_squared_diff / y.len() as f32
}
pub(super) fn compute_mse(y_left: &[f32], y_right: &[f32]) -> f32 {
contract_pre_mse_split!();
let n_left = y_left.len() as f32;
let n_right = y_right.len() as f32;
let n_total = n_left + n_right;
if n_total == 0.0 {
return 0.0;
}
let var_left = variance_f32(y_left);
let var_right = variance_f32(y_right);
(n_left / n_total) * var_left + (n_right / n_total) * var_right
}
pub(super) fn get_unique_feature_values(
x: &crate::primitives::Matrix<f32>,
feature_idx: usize,
n_samples: usize,
) -> Vec<f32> {
let mut values: Vec<f32> = (0..n_samples).map(|i| x.get(i, feature_idx)).collect();
values.sort_by(|a, b| a.total_cmp(b));
values.dedup();
values
}
pub(super) fn split_by_threshold(
x: &crate::primitives::Matrix<f32>,
y: &[f32],
feature_idx: usize,
threshold: f32,
) -> (Vec<f32>, Vec<f32>) {
let mut y_left = Vec::new();
let mut y_right = Vec::new();
for (row, &y_val) in y.iter().enumerate() {
if x.get(row, feature_idx) <= threshold {
y_left.push(y_val);
} else {
y_right.push(y_val);
}
}
(y_left, y_right)
}
pub(super) fn evaluate_split_gain(
y_left: &[f32],
y_right: &[f32],
current_variance: f32,
) -> Option<f32> {
if y_left.is_empty() || y_right.is_empty() {
return None;
}
let split_mse = compute_mse(y_left, y_right);
let gain = current_variance - split_mse;
(gain > 0.0).then_some(gain)
}
pub(super) fn find_best_regression_split_for_feature(
x: &crate::primitives::Matrix<f32>,
y: &[f32],
feature_idx: usize,
n_samples: usize,
current_variance: f32,
) -> Option<(f32, f32)> {
let feature_values = get_unique_feature_values(x, feature_idx, n_samples);
let mut best_threshold = 0.0;
let mut best_gain = 0.0;
for i in 0..feature_values.len().saturating_sub(1) {
let threshold = f32::midpoint(feature_values[i], feature_values[i + 1]);
let (y_left, y_right) = split_by_threshold(x, y, feature_idx, threshold);
if let Some(gain) = evaluate_split_gain(&y_left, &y_right, current_variance) {
if gain > best_gain {
best_gain = gain;
best_threshold = threshold;
}
}
}
(best_gain > 0.0).then_some((best_threshold, best_gain))
}
pub(super) fn find_best_regression_split(
x: &crate::primitives::Matrix<f32>,
y: &[f32],
) -> Option<(usize, f32, f32)> {
let (n_samples, n_features) = x.shape();
if n_samples < 2 {
return None;
}
let current_variance = variance_f32(y);
let mut best_gain = 0.0;
let mut best_feature = 0;
let mut best_threshold = 0.0;
for feature_idx in 0..n_features {
if let Some((threshold, gain)) =
find_best_regression_split_for_feature(x, y, feature_idx, n_samples, current_variance)
{
if gain > best_gain {
best_gain = gain;
best_feature = feature_idx;
best_threshold = threshold;
}
}
}
(best_gain > 0.0).then_some((best_feature, best_threshold, best_gain))
}
pub(super) fn split_regression_data_by_indices(
x: &crate::primitives::Matrix<f32>,
y: &[f32],
indices: &[usize],
) -> (crate::primitives::Matrix<f32>, Vec<f32>) {
let (_n_samples, n_features) = x.shape();
let n_subset = indices.len();
let mut subset_data = Vec::with_capacity(n_subset * n_features);
let mut subset_labels = Vec::with_capacity(n_subset);
for &idx in indices {
for col in 0..n_features {
subset_data.push(x.get(idx, col));
}
subset_labels.push(y[idx]);
}
let subset_matrix = crate::primitives::Matrix::from_vec(n_subset, n_features, subset_data)
.expect("subset data length matches n_subset * n_features");
(subset_matrix, subset_labels)
}
pub(super) fn make_regression_leaf(y_slice: &[f32], n_samples: usize) -> RegressionTreeNode {
RegressionTreeNode::Leaf(RegressionLeaf {
value: mean_f32(y_slice),
n_samples,
})
}
pub(super) fn at_max_depth(depth: usize, max_depth: Option<usize>) -> bool {
max_depth.is_some_and(|max_d| depth >= max_d)
}
pub(super) fn partition_by_threshold(
x: &crate::primitives::Matrix<f32>,
n_samples: usize,
feature_idx: usize,
threshold: f32,
) -> (Vec<usize>, Vec<usize>) {
let mut left = Vec::new();
let mut right = Vec::new();
for row in 0..n_samples {
if x.get(row, feature_idx) <= threshold {
left.push(row);
} else {
right.push(row);
}
}
(left, right)
}
pub(super) fn build_regression_tree(
x: &crate::primitives::Matrix<f32>,
y: &crate::primitives::Vector<f32>,
depth: usize,
max_depth: Option<usize>,
min_samples_split: usize,
min_samples_leaf: usize,
) -> RegressionTreeNode {
let n_samples = y.len();
let y_slice: Vec<f32> = y.as_slice().to_vec();
if n_samples < min_samples_split
|| at_max_depth(depth, max_depth)
|| variance_f32(&y_slice) < 1e-10
{
return make_regression_leaf(&y_slice, n_samples);
}
let Some((feature_idx, threshold, _gain)) = find_best_regression_split(x, &y_slice) else {
return make_regression_leaf(&y_slice, n_samples);
};
let (left_indices, right_indices) =
partition_by_threshold(x, n_samples, feature_idx, threshold);
if left_indices.len() < min_samples_leaf || right_indices.len() < min_samples_leaf {
return make_regression_leaf(&y_slice, n_samples);
}
let (left_matrix, left_labels) = split_regression_data_by_indices(x, &y_slice, &left_indices);
let (right_matrix, right_labels) =
split_regression_data_by_indices(x, &y_slice, &right_indices);
let left_child = build_regression_tree(
&left_matrix,
&crate::primitives::Vector::from_vec(left_labels),
depth + 1,
max_depth,
min_samples_split,
min_samples_leaf,
);
let right_child = build_regression_tree(
&right_matrix,
&crate::primitives::Vector::from_vec(right_labels),
depth + 1,
max_depth,
min_samples_split,
min_samples_leaf,
);
RegressionTreeNode::Node(RegressionNode {
feature_idx,
threshold,
left: Box::new(left_child),
right: Box::new(right_child),
})
}
pub(super) fn compute_tree_feature_importances(node: &TreeNode, importances: &mut [f32]) {
match node {
TreeNode::Leaf(_) => {
}
TreeNode::Node(n) => {
let n_samples = count_tree_samples(node) as f32;
importances[n.feature_idx] += n_samples;
compute_tree_feature_importances(&n.left, importances);
compute_tree_feature_importances(&n.right, importances);
}
}
}
pub(super) fn count_tree_samples(node: &TreeNode) -> usize {
match node {
TreeNode::Leaf(leaf) => leaf.n_samples,
TreeNode::Node(n) => {
count_tree_samples(&n.left) + count_tree_samples(&n.right)
}
}
}
pub(super) fn compute_regression_tree_feature_importances(
node: &RegressionTreeNode,
importances: &mut [f32],
) {
match node {
RegressionTreeNode::Leaf(_) => {
}
RegressionTreeNode::Node(n) => {
let n_samples = count_regression_tree_samples(node) as f32;
importances[n.feature_idx] += n_samples;
compute_regression_tree_feature_importances(&n.left, importances);
compute_regression_tree_feature_importances(&n.right, importances);
}
}
}
pub(super) fn count_regression_tree_samples(node: &RegressionTreeNode) -> usize {
match node {
RegressionTreeNode::Leaf(leaf) => leaf.n_samples,
RegressionTreeNode::Node(n) => {
count_regression_tree_samples(&n.left) + count_regression_tree_samples(&n.right)
}
}
}
pub(super) fn bootstrap_sample(n_samples: usize, random_state: Option<u64>) -> Vec<usize> {
use rand::distr::{Distribution, Uniform};
use rand::SeedableRng;
let dist = Uniform::new(0, n_samples).expect("valid uniform range: 0..n_samples");
let mut indices = Vec::with_capacity(n_samples);
if let Some(seed) = random_state {
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
for _ in 0..n_samples {
indices.push(dist.sample(&mut rng));
}
} else {
let mut rng = rand::rng();
for _ in 0..n_samples {
indices.push(dist.sample(&mut rng));
}
}
indices
}