use perpetual::{Matrix, objective::Objective};
use std::hint::black_box;
use std::time::Instant;
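
// Microbenchmarks for the hot paths of single-tree construction: per-sample
// loss, per-node loss-decrement updates, index partitioning, histogram
// scatter-add, and rayon scope overhead, followed by a rough per-tree cost
// breakdown against an observed ~42ms tree build.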
fn main() {
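// Load the California housing training CSV (skipping the header line) into
// column-major feature vectors plus the target vector y.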
let file = std::fs::read_to_string("resources/cal_housing_train.csv")
.expect("failed to read resources/cal_housing_train.csv");
let mut lines = file.lines();
lines.next();
let n_features = 8;
let mut columns: Vec<Vec<f64>> = vec![Vec::new(); n_features];
let mut y: Vec<f64> = Vec::new();
for line in lines {
let vals: Vec<f64> = line.split(',').map(|x| x.parse().unwrap()).collect();
y.push(vals[n_features]);
for j in 0..n_features {
columns[j].push(vals[j]);
}
}
let data_vec: Vec<f64> = columns.into_iter().flatten().collect();
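// The Matrix is constructed here but not read by the benchmarks below; they
// only use y and n directly.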
let _data = Matrix::new(&data_vec, y.len(), n_features);
let n = y.len();
eprintln!("N={} F={}", n, n_features);
let obj = Objective::SquaredLoss;
let yhat_val = 2.0_f64;
let y_val = 1.5_f64;
let n_iters = 1_000_000;
let t = Instant::now();
let mut acc = 0.0f32;
for _ in 0..n_iters {
acc += black_box(obj.loss_single(black_box(y_val), black_box(yhat_val), None));
}
let loss_single_ns = t.elapsed().as_nanos() as f64 / n_iters as f64;
eprintln!("loss_single: {:.1}ns (acc={})", loss_single_ns, acc);
let node_size = 1000;
let yhat = vec![2.0_f64; n];
let loss = vec![0.5_f32; n];
let mut loss_decr = vec![0.0_f32; n];
let mut loss_decr_avg = 0.0_f32;
let index_length = n as f32;
let weight = 0.1_f64;
let indices: Vec<usize> = (0..node_size).collect();
let n_reps = 10000;
let t = Instant::now();
for _rep in 0..n_reps {
loss_decr_avg = 0.0;
for &i in indices.iter() {
let yhat_new = yhat[i] + weight;
let loss_new = obj.loss_single(y[i], yhat_new, None);
loss_decr_avg -= loss_decr[i] / index_length;
loss_decr[i] = loss[i] - loss_new;
loss_decr_avg += loss_decr[i] / index_length;
}
black_box(loss_decr_avg);
}
let loss_loop_us = t.elapsed().as_micros() as f64 / n_reps as f64;
eprintln!(
"loss_decr loop (n={}): {:.1}μs ({:.1}ns/sample)",
node_size,
loss_loop_us,
loss_loop_us * 1000.0 / node_size as f64
);
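
// Benchmark 3: Hoare-style two-pointer partition of an index array around a
// split bin, as when routing samples to left/right child nodes. idx is reset
// to the identity permutation after each rep so every rep partitions the same
// input.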
let n_pivot = 16000;
let mut idx: Vec<usize> = (0..n_pivot).collect();
let feature_col: Vec<u16> = (0..n_pivot).map(|x| (x % 256) as u16).collect();
let split_bin = 128_u16;
let n_reps = 10000;
let t = Instant::now();
for _rep in 0..n_reps {
let mut lo = 0;
let mut hi = n_pivot - 1;
while lo < hi {
while lo < hi && feature_col[idx[lo]] < split_bin {
lo += 1;
}
while lo < hi && feature_col[idx[hi]] >= split_bin {
hi -= 1;
}
if lo < hi {
idx.swap(lo, hi);
lo += 1;
hi -= 1;
}
}
black_box(&idx);
for (i, item) in idx.iter_mut().enumerate() {
*item = i;
}
}
let pivot_us = t.elapsed().as_micros() as f64 / n_reps as f64;
eprintln!(
"pivot (n={}): {:.1}μs ({:.1}ns/sample)",
n_pivot,
pivot_us,
pivot_us * 1000.0 / n_pivot as f64
);
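
// Benchmark 4: histogram scatter-add through a pseudo-randomly ordered index
// array into a flat [bin * 5 + fold] layout. Assumes n >= n_accum so bins[i]
// and grads[i] stay in bounds.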
let n_accum = 16000;
let index_arr: Vec<usize> = {
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
(0..n_accum)
.map(|i| {
let mut h = DefaultHasher::new();
i.hash(&mut h);
(h.finish() as usize) % n_accum
})
.collect()
};
let bins: Vec<u16> = (0..n).map(|i| (i % 256) as u16).collect();
let grads: Vec<f32> = (0..n).map(|i| i as f32 * 0.001).collect();
let mut flat = vec![0.0_f32; 256 * 5];
let mut flat_c = vec![0_u32; 256 * 5];
let n_reps = 100000;
let t = Instant::now();
for _rep in 0..n_reps {
for &i in &index_arr {
let bin = bins[i] as usize;
let fold = i % 5;
let slot = bin * 5 + fold;
flat[slot] += grads[i];
flat_c[slot] += 1;
}
black_box(&flat);
}
let accum_us = t.elapsed().as_micros() as f64 / n_reps as f64;
eprintln!(
"scatter-add (n={}): {:.1}μs ({:.1}ns/sample)",
n_accum,
accum_us,
accum_us * 1000.0 / n_accum as f64
);
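
// Benchmark 5: rayon scope overhead on a dedicated 8-thread pool, spawning 8
// trivial tasks per scope; the atomic counter keeps the tasks from being
// optimized away.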
let pool = rayon::ThreadPoolBuilder::new().num_threads(8).build().unwrap();
let n_reps = 100000;
let counter = std::sync::atomic::AtomicUsize::new(0);
let t = Instant::now();
for _rep in 0..n_reps {
pool.scope(|s| {
for _i in 0..8 {
s.spawn(|_| {
counter.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
});
}
});
}
let scope_us = t.elapsed().as_micros() as f64 / n_reps as f64;
eprintln!(
"rayon scope (8 tasks): {:.1}μs (counter={})",
scope_us,
counter.load(std::sync::atomic::Ordering::Relaxed)
);
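
// Baseline: the same pool with an empty scope body, isolating pure scope
// setup/teardown cost.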
let t = Instant::now();
for _rep in 0..n_reps {
pool.scope(|_s| {});
}
let empty_scope_us = t.elapsed().as_micros() as f64 / n_reps as f64;
eprintln!("rayon scope (empty): {:.1}μs", empty_scope_us);
let splits = 198;
let total_hist_samples = 16512 * 8;
let total_pivot_samples = 16512 * 9;
let total_loss_samples = 16512 * 9;
let estimated_hist_ms = total_hist_samples as f64 * accum_us / n_accum as f64 / 1000.0 / 8.0;
let estimated_pivot_ms = total_pivot_samples as f64 * pivot_us / n_pivot as f64 / 1000.0;
let estimated_loss_ms = total_loss_samples as f64 * loss_loop_us / node_size as f64 / 1000.0;
let estimated_rayon_ms = splits as f64 * 2.0 * scope_us / 1000.0;
let estimated_subtraction_ms = splits as f64 * 8.0 * 256.0 * 15.0 * 1.0 / 1_000_000.0;
eprintln!("\n--- Estimated per-tree breakdown (198 splits) ---");
eprintln!("histogram build: {:.2}ms", estimated_hist_ms);
eprintln!("pivot: {:.2}ms", estimated_pivot_ms);
eprintln!("loss_decr: {:.2}ms", estimated_loss_ms);
eprintln!(
"rayon overhead: {:.2}ms (2 scopes/split × {:.1}μs)",
estimated_rayon_ms, scope_us
);
eprintln!("subtraction: {:.2}ms", estimated_subtraction_ms);
let total_estimated =
estimated_hist_ms + estimated_pivot_ms + estimated_loss_ms + estimated_rayon_ms + estimated_subtraction_ms;
eprintln!("TOTAL estimated: {:.2}ms (actual ~42ms)", total_estimated);
eprintln!(
"unaccounted: {:.2}ms ({:.0}%)",
42.0 - total_estimated,
(42.0 - total_estimated) / 42.0 * 100.0
);
}