use burn::{
prelude::Backend,
tensor::{Tensor, TensorData},
};
use fast_umap::{
model::{UMAPModel, UMAPModelConfigBuilder},
normalizer::normalize as layer_normalize,
utils::{
convert_tensor_to_vector, convert_vector_to_tensor, generate_test_data, normalize_data,
normalize_tensor,
},
};
use std::{
fs,
io::Write,
path::Path,
time::{Duration, Instant},
};
type CpuBackend = burn::backend::NdArray<f32>;
type GpuBackend =
burn::backend::wgpu::CubeBackend<cubecl::wgpu::WgpuRuntime, f32, i32, u32>;
fn detect_cpu() -> String {
#[cfg(target_os = "macos")]
{
if let Ok(out) = std::process::Command::new("sysctl")
.args(["-n", "machdep.cpu.brand_string"])
.output()
{
let s = String::from_utf8_lossy(&out.stdout).trim().to_string();
if !s.is_empty() {
return s;
}
}
}
#[cfg(target_os = "linux")]
{
if let Ok(text) = fs::read_to_string("/proc/cpuinfo") {
for line in text.lines() {
if line.starts_with("model name") {
if let Some(v) = line.split(':').nth(1) {
return v.trim().to_string();
}
}
}
let mut hw = String::new();
let mut implementer = String::new();
for line in text.lines() {
if line.starts_with("Hardware") {
if let Some(v) = line.split(':').nth(1) {
let s = v.trim().to_string();
if !s.is_empty() { hw = s; }
}
}
if implementer.is_empty() && line.starts_with("CPU implementer") {
if let Some(v) = line.split(':').nth(1) {
implementer = v.trim().to_string();
}
}
}
if !hw.is_empty() {
return hw;
}
if !implementer.is_empty() {
let arch = std::process::Command::new("uname")
.arg("-m").output()
.map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
.unwrap_or_else(|_| "aarch64".into());
let vendor = match implementer.as_str() {
"0x41" => "ARM Cortex",
"0x51" => "Qualcomm",
"0x61" => "Apple Silicon",
"0x53" => "Samsung",
"0xc0" => "Ampere",
other => return format!("ARM64 (implementer {other})"),
};
return format!("{vendor} ({arch})");
}
}
if let Ok(out) = std::process::Command::new("lscpu").output() {
for line in String::from_utf8_lossy(&out.stdout).lines() {
if line.starts_with("Model name") {
if let Some(v) = line.split(':').nth(1) {
let s = v.trim().to_string();
if !s.is_empty() {
return s;
}
}
}
}
}
}
#[cfg(target_os = "windows")]
{
if let Ok(out) = std::process::Command::new("wmic")
.args(["cpu", "get", "Name", "/value"])
.output()
{
for line in String::from_utf8_lossy(&out.stdout).lines() {
if let Some(v) = line.strip_prefix("Name=") {
return v.trim().to_string();
}
}
}
}
"Unknown CPU".to_string()
}
fn detect_gpu_name() -> Option<String> {
#[cfg(target_os = "macos")]
{
if let Ok(out) = std::process::Command::new("system_profiler")
.arg("SPDisplaysDataType")
.output()
{
let text = String::from_utf8_lossy(&out.stdout);
for line in text.lines() {
let line = line.trim();
if let Some(v) = line.strip_prefix("Chipset Model:") {
return Some(v.trim().to_string());
}
}
}
}
#[cfg(target_os = "linux")]
{
if let Ok(out) = std::process::Command::new("nvidia-smi")
.args(["--query-gpu=name", "--format=csv,noheader"])
.output()
{
let s = String::from_utf8_lossy(&out.stdout).trim().to_string();
if !s.is_empty() && out.status.success() {
return Some(s.lines().next().unwrap_or(&s).trim().to_string());
}
}
if let Ok(out) = std::process::Command::new("lspci").output() {
for line in String::from_utf8_lossy(&out.stdout).lines() {
let l = line.to_lowercase();
if l.contains("vga") || l.contains("3d") || l.contains("display") {
if let Some(idx) = line.find(':') {
if let Some(idx2) = line[idx + 1..].find(':') {
return Some(line[idx + 1 + idx2 + 1..].trim().to_string());
}
}
}
}
}
}
#[cfg(target_os = "windows")]
{
if let Ok(out) = std::process::Command::new("wmic")
.args(["path", "win32_VideoController", "get", "Name", "/value"])
.output()
{
for line in String::from_utf8_lossy(&out.stdout).lines() {
if let Some(v) = line.strip_prefix("Name=") {
let name = v.trim().to_string();
if !name.is_empty() {
return Some(name);
}
}
}
}
}
None
}
fn gpu_available() -> bool {
std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
let device = burn::backend::wgpu::WgpuDevice::default();
let t: Tensor<GpuBackend, 1> =
Tensor::from_data(TensorData::new(vec![1.0f32], [1]), &device);
let _ = t.to_data().to_vec::<f32>().unwrap();
}))
.is_ok()
}
fn sanitize(s: &str) -> String {
let mut out = String::new();
let mut prev_us = true; for c in s.chars() {
let mapped = match c {
'a'..='z' | '0'..='9' | '-' => c,
'A'..='Z' => c.to_ascii_lowercase(),
_ => '_',
};
if mapped == '_' {
if !prev_us {
out.push('_');
prev_us = true;
}
} else {
out.push(mapped);
prev_us = false;
}
}
while out.ends_with('_') {
out.pop();
}
out.chars().take(50).collect()
}
fn measure<F: FnMut()>(mut f: F, warmup: usize, iters: usize) -> (f64, f64, f64) {
for _ in 0..warmup {
f();
}
let mut times = Vec::with_capacity(iters);
for _ in 0..iters {
let t0 = Instant::now();
f();
times.push(t0.elapsed());
}
times.sort();
let mean = times.iter().sum::<Duration>() / times.len() as u32;
let min = times[0];
let max = *times.last().unwrap();
(
mean.as_nanos() as f64 / 1_000.0,
min.as_nanos() as f64 / 1_000.0,
max.as_nanos() as f64 / 1_000.0,
)
}
#[derive(Clone)]
struct Entry {
label: String,
mean_us: f64,
lo_us: f64,
hi_us: f64,
}
#[derive(Clone)]
struct Group {
name: String,
key: String,
entries: Vec<Entry>,
}
fn build_model<B: Backend>(
input: usize,
hidden: &[usize],
output: usize,
device: &B::Device,
) -> UMAPModel<B> {
let cfg = UMAPModelConfigBuilder::default()
.input_size(input)
.hidden_sizes(hidden.to_vec())
.output_size(output)
.build()
.unwrap();
UMAPModel::new(&cfg, device)
}
fn run_cpu_benchmarks() -> Vec<Group> {
let device = burn::backend::ndarray::NdArrayDevice::Cpu;
let mut groups = Vec::new();
{
let mut entries = Vec::new();
for (s, f) in [(100, 10), (500, 30), (1_000, 50), (5_000, 100)] {
let iters = if s * f > 100_000 { 10 } else { 30 };
let (mean, lo, hi) = measure(
|| {
let mut data: Vec<f64> = generate_test_data(s, f);
normalize_data(&mut data, s, f);
},
3,
iters,
);
entries.push(Entry {
label: format!("{s}×{f}"),
mean_us: mean,
lo_us: lo,
hi_us: hi,
});
}
groups.push(Group { name: "normalize_data".into(), key: "normalize_data".into(), entries });
}
{
let mut entries = Vec::new();
for (s, f) in [(100, 10), (500, 30), (1_000, 50), (5_000, 100)] {
let (mean, lo, hi) = measure(|| { let _: Vec<f32> = generate_test_data(s, f); }, 5, 50);
entries.push(Entry { label: format!("{s}×{f}"), mean_us: mean, lo_us: lo, hi_us: hi });
}
groups.push(Group { name: "generate_test_data".into(), key: "generate_test_data".into(), entries });
}
{
let mut entries = Vec::new();
for (s, f) in [(100, 10), (500, 30), (1_000, 50)] {
let data: Vec<f32> = generate_test_data(s, f);
let (mean, lo, hi) = measure(
|| {
let t: Tensor<CpuBackend, 2> =
convert_vector_to_tensor(data.clone(), s, f, &device);
let _: Vec<Vec<f64>> = convert_tensor_to_vector(t);
},
3,
30,
);
entries.push(Entry { label: format!("{s}×{f}"), mean_us: mean, lo_us: lo, hi_us: hi });
}
groups.push(Group { name: "tensor_convert (Vec↔Tensor)".into(), key: "tensor_convert".into(), entries });
}
{
let configs: &[(&str, usize, &[usize], usize, usize)] = &[
("16s×10f [32]→2", 16, &[32], 10, 2),
("64s×50f [64]→2", 64, &[64], 50, 2),
("128s×50f [128]→2", 128, &[128], 50, 2),
("64s×100f [128,64]→3", 64, &[128, 64], 100, 3),
("256s×100f [256,128]→2", 256, &[256, 128],100, 2),
];
let mut entries = Vec::new();
for &(label, s, hidden, f, out) in configs {
let model: UMAPModel<CpuBackend> = build_model(f, hidden, out, &device);
let data: Vec<f32> = generate_test_data(s, f);
let input: Tensor<CpuBackend, 2> =
Tensor::from_data(TensorData::new(data, [s, f]), &device);
let iters = if s > 100 { 20 } else { 50 };
let (mean, lo, hi) = measure(|| { let _ = model.forward(input.clone()); }, 3, iters);
entries.push(Entry { label: label.into(), mean_us: mean, lo_us: lo, hi_us: hi });
}
groups.push(Group { name: "model_forward (CPU / NdArray)".into(), key: "model_forward".into(), entries });
}
{
let mut entries = Vec::new();
for n in [64usize, 512, 4_096, 32_768] {
let data: Vec<f32> = generate_test_data(n, 1);
let (mean, lo, hi) = measure(
|| {
let t: Tensor<CpuBackend, 1> =
Tensor::from_data(TensorData::new(data.clone(), [n]), &device);
let _ = normalize_tensor(t);
},
5,
50,
);
entries.push(Entry { label: format!("n={n}"), mean_us: mean, lo_us: lo, hi_us: hi });
}
groups.push(Group { name: "normalize_tensor (1-D min-max)".into(), key: "normalize_tensor".into(), entries });
}
{
let mut entries = Vec::new();
for (s, f) in [(32, 16), (128, 64), (512, 128), (1_000, 256)] {
let data: Vec<f32> = generate_test_data(s, f);
let t: Tensor<CpuBackend, 2> =
Tensor::from_data(TensorData::new(data, [s, f]), &device);
let (mean, lo, hi) =
measure(|| { let _ = layer_normalize(t.clone()); }, 3, 30);
entries.push(Entry { label: format!("{s}×{f}"), mean_us: mean, lo_us: lo, hi_us: hi });
}
groups.push(Group { name: "layer_normalize (2-D)".into(), key: "layer_normalize".into(), entries });
}
groups
}
fn run_gpu_benchmarks() -> Vec<Group> {
let device = burn::backend::wgpu::WgpuDevice::default();
let mut groups = Vec::new();
{
let configs: &[(&str, usize, &[usize], usize, usize)] = &[
("16s×10f [32]→2", 16, &[32], 10, 2),
("64s×50f [64]→2", 64, &[64], 50, 2),
("128s×50f [128]→2", 128, &[128], 50, 2),
("64s×100f [128,64]→3", 64, &[128, 64], 100, 3),
("256s×100f [256,128]→2", 256, &[256, 128],100, 2),
("512s×100f [256,128]→2", 512, &[256, 128],100, 2),
];
let mut entries = Vec::new();
for &(label, s, hidden, f, out) in configs {
let model: UMAPModel<GpuBackend> = build_model(f, hidden, out, &device);
let data: Vec<f32> = generate_test_data(s, f);
let input: Tensor<GpuBackend, 2> =
Tensor::from_data(TensorData::new(data, [s, f]), &device);
let iters = 30;
let (mean, lo, hi) = measure(
|| {
let out = model.forward(input.clone());
let _ = out.to_data().to_vec::<f32>().unwrap();
},
5,
iters,
);
entries.push(Entry { label: label.into(), mean_us: mean, lo_us: lo, hi_us: hi });
}
groups.push(Group { name: "model_forward (GPU / WGPU)".into(), key: "model_forward".into(), entries });
}
{
let mut entries = Vec::new();
for n in [512usize, 4_096, 32_768, 262_144] {
let data: Vec<f32> = generate_test_data(n, 1);
let (mean, lo, hi) = measure(
|| {
let t: Tensor<GpuBackend, 1> =
Tensor::from_data(TensorData::new(data.clone(), [n]), &device);
let out = normalize_tensor(t);
let _ = out.to_data().to_vec::<f32>().unwrap();
},
5,
30,
);
entries.push(Entry { label: format!("n={n}"), mean_us: mean, lo_us: lo, hi_us: hi });
}
groups.push(Group { name: "normalize_tensor (GPU / WGPU)".into(), key: "normalize_tensor".into(), entries });
}
{
let mut entries = Vec::new();
for (s, f) in [(128, 64), (512, 128), (1_000, 256), (4_000, 512)] {
let data: Vec<f32> = generate_test_data(s, f);
let (mean, lo, hi) = measure(
|| {
let t: Tensor<GpuBackend, 2> =
Tensor::from_data(TensorData::new(data.clone(), [s, f]), &device);
let out = layer_normalize(t);
let _ = out.to_data().to_vec::<f32>().unwrap();
},
5,
20,
);
entries.push(Entry { label: format!("{s}×{f}"), mean_us: mean, lo_us: lo, hi_us: hi });
}
groups.push(Group { name: "layer_normalize (GPU / WGPU)".into(), key: "layer_normalize".into(), entries });
}
groups
}
fn fmt_us(us: f64) -> String {
if us < 1.0 {
format!("{:.2} ns", us * 1_000.0)
} else if us < 1_000.0 {
format!("{:.2} µs", us)
} else if us < 1_000_000.0 {
format!("{:.2} ms", us / 1_000.0)
} else {
format!("{:.3} s", us / 1_000_000.0)
}
}
fn write_markdown(path: &str, hw_label: &str, hw_detail: &str, groups: &[Group]) {
let mut md = String::new();
md.push_str(&format!("# fast-umap — Benchmark Results\n\n"));
md.push_str(&format!("> **Hardware:** {hw_label} \n"));
md.push_str(&format!("> **Detail:** {hw_detail} \n"));
md.push_str("> **Backend:** burn 0.20.1 \n");
md.push_str("> **Reproduce:** `cargo run --release --bin bench_report`\n\n");
md.push_str("\n\n");
md.push_str("All times are **[min, mean, max]** over multiple timed iterations.\n\n");
md.push_str("---\n\n");
for g in groups {
md.push_str(&format!("## `{}`\n\n", g.name));
md.push_str("| Input | Min | **Mean** | Max |\n");
md.push_str("|-------|-----|----------|-----|\n");
for e in &g.entries {
md.push_str(&format!(
"| `{}` | {} | **{}** | {} |\n",
e.label,
fmt_us(e.lo_us),
fmt_us(e.mean_us),
fmt_us(e.hi_us)
));
}
md.push('\n');
}
fs::create_dir_all(Path::new(path).parent().unwrap()).unwrap();
fs::write(path, &md).unwrap();
println!(" Wrote {path}");
}
fn write_svg(path: &str, hw_label: &str, groups: &[Group]) {
const COLS: usize = 2;
const SVG_W: f64 = 960.0;
const TITLE_H: f64 = 48.0;
const PANEL_W: f64 = SVG_W / COLS as f64; const PANEL_H: f64 = 210.0;
const CHART_LEFT: f64 = 170.0;
const CHART_RIGHT: f64 = 30.0;
const CHART_TOP: f64 = 36.0;
const CHART_BOTTOM: f64 = 34.0;
let rows = (groups.len() + COLS - 1) / COLS;
let svg_h = TITLE_H + rows as f64 * PANEL_H + 20.0;
let colours = [
"4C78A8", "F58518", "E45756", "72B7B2", "54A24B", "B279A2",
"FF9DA7", "9D755D",
];
let log_ticks = |lo: f64, hi: f64| -> Vec<f64> {
let mut ticks = Vec::new();
let mut p = lo.log10().floor() as i32;
while 10f64.powi(p) <= hi * 1.1 {
for &m in &[1.0, 2.0, 5.0] {
let v = m * 10f64.powi(p);
if v >= lo * 0.8 && v <= hi * 1.2 {
ticks.push(v);
}
}
p += 1;
}
ticks.sort_by(|a, b| a.partial_cmp(b).unwrap());
ticks.dedup_by(|a, b| (*a - *b).abs() < 1e-12);
ticks
};
let mut out: Vec<u8> = Vec::new();
let w = &mut out;
writeln!(w, "<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"{SVG_W}\" height=\"{svg_h}\" font-family=\"monospace,Arial,sans-serif\">").unwrap();
writeln!(w, "<rect width=\"{SVG_W}\" height=\"{svg_h}\" fill=\"#FAFAFA\"/>").unwrap();
writeln!(w, "<text x=\"{}\" y=\"30\" text-anchor=\"middle\" font-size=\"15\" font-weight=\"bold\" fill=\"#222\">{hw_label} \u{2014} fast-umap Benchmark Results</text>",
SVG_W / 2.0).unwrap();
for (idx, g) in groups.iter().enumerate() {
let col = (idx % COLS) as f64;
let row = (idx / COLS) as f64;
let px = col * PANEL_W;
let py = TITLE_H + row * PANEL_H;
let hex = colours[idx % colours.len()]; let colour = format!("#{hex}");
let vals: Vec<f64> = g.entries.iter().map(|e| e.mean_us).collect();
let lo_val = vals.iter().cloned().fold(f64::INFINITY, f64::min);
let hi_val = vals.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
if lo_val <= 0.0 || hi_val <= 0.0 {
continue;
}
let chart_w = PANEL_W - CHART_LEFT - CHART_RIGHT;
let chart_h = PANEL_H - CHART_TOP - CHART_BOTTOM;
let bar_count = g.entries.len() as f64;
let bar_gap = 5.0_f64;
let bar_h = ((chart_h - bar_gap * (bar_count - 1.0)) / bar_count).max(10.0);
let lo_log = (lo_val * 0.55).log10();
let hi_log = (hi_val * 1.45).log10();
let x_of = |v: f64| -> f64 {
(v.log10() - lo_log) / (hi_log - lo_log) * chart_w
};
let cx = px + CHART_LEFT;
let cy = py + CHART_TOP;
writeln!(w,
"<rect x=\"{:.1}\" y=\"{:.1}\" width=\"{:.1}\" height=\"{:.1}\" rx=\"6\" fill=\"white\" stroke=\"#DDD\" stroke-width=\"1\"/>",
px + 4.0, py + 2.0, PANEL_W - 8.0, PANEL_H - 4.0
).unwrap();
writeln!(w,
"<text x=\"{:.1}\" y=\"{:.1}\" text-anchor=\"middle\" font-size=\"10.5\" font-weight=\"bold\" fill=\"#333\">{}</text>",
px + PANEL_W / 2.0, py + 20.0, g.name
).unwrap();
let ticks = log_ticks(lo_val * 0.55, hi_val * 1.45);
for t in &ticks {
let tx = cx + x_of(*t);
writeln!(w,
"<line x1=\"{tx:.1}\" y1=\"{cy:.1}\" x2=\"{tx:.1}\" y2=\"{:.1}\" stroke=\"#EEE\" stroke-width=\"1\"/>",
cy + chart_h
).unwrap();
writeln!(w,
"<text x=\"{tx:.1}\" y=\"{:.1}\" text-anchor=\"middle\" font-size=\"8\" fill=\"#888\">{}</text>",
cy + chart_h + 12.0, fmt_us(*t)
).unwrap();
}
for (i, e) in g.entries.iter().enumerate() {
let by = cy + i as f64 * (bar_h + bar_gap);
let bw = x_of(e.mean_us);
let lo_x = cx + x_of(e.lo_us.max(lo_val * 0.56));
let hi_x = cx + x_of(e.hi_us);
let mid_y = by + bar_h / 2.0;
let lbl_y = mid_y + 3.5;
writeln!(w,
"<rect x=\"{cx:.1}\" y=\"{by:.1}\" width=\"{bw:.1}\" height=\"{bar_h:.1}\" rx=\"2\" fill=\"{colour}\" opacity=\"0.80\"/>",
).unwrap();
writeln!(w,
"<line x1=\"{lo_x:.1}\" y1=\"{mid_y:.1}\" x2=\"{hi_x:.1}\" y2=\"{mid_y:.1}\" stroke=\"#555\" stroke-width=\"1.2\"/>",
).unwrap();
writeln!(w,
"<line x1=\"{lo_x:.1}\" y1=\"{:.1}\" x2=\"{lo_x:.1}\" y2=\"{:.1}\" stroke=\"#555\" stroke-width=\"1.2\"/>",
mid_y - 3.0, mid_y + 3.0
).unwrap();
writeln!(w,
"<line x1=\"{hi_x:.1}\" y1=\"{:.1}\" x2=\"{hi_x:.1}\" y2=\"{:.1}\" stroke=\"#555\" stroke-width=\"1.2\"/>",
mid_y - 3.0, mid_y + 3.0
).unwrap();
writeln!(w,
"<text x=\"{:.1}\" y=\"{lbl_y:.1}\" text-anchor=\"end\" font-size=\"8.5\" fill=\"#333\">{}</text>",
cx - 4.0, e.label
).unwrap();
writeln!(w,
"<text x=\"{:.1}\" y=\"{lbl_y:.1}\" font-size=\"8\" fill=\"#555\">{}</text>",
cx + bw + 4.0, fmt_us(e.mean_us)
).unwrap();
}
writeln!(w,
"<line x1=\"{cx:.1}\" y1=\"{:.1}\" x2=\"{:.1}\" y2=\"{:.1}\" stroke=\"#BBB\" stroke-width=\"1\"/>",
cy + chart_h, cx + chart_w, cy + chart_h
).unwrap();
}
writeln!(w, "</svg>").unwrap();
let s = String::from_utf8(out).expect("SVG is valid UTF-8");
fs::create_dir_all(Path::new(path).parent().unwrap()).unwrap();
fs::write(path, &s).unwrap();
println!(" Wrote {path}");
}
#[derive(Clone)]
struct Pair {
label: String,
cpu_us: f64,
gpu_us: f64,
}
impl Pair {
fn speedup(&self) -> f64 {
self.cpu_us / self.gpu_us
}
}
#[derive(Clone)]
struct CompGroup {
key: String,
cpu_name: String,
gpu_name: String,
pairs: Vec<Pair>,
}
fn build_comparison(cpu: &[Group], gpu: &[Group]) -> Vec<CompGroup> {
let mut result = Vec::new();
for gpu_g in gpu {
if let Some(cpu_g) = cpu.iter().find(|g| g.key == gpu_g.key) {
let pairs: Vec<Pair> = gpu_g
.entries
.iter()
.filter_map(|ge| {
cpu_g
.entries
.iter()
.find(|ce| ce.label == ge.label)
.map(|ce| Pair {
label: ge.label.clone(),
cpu_us: ce.mean_us,
gpu_us: ge.mean_us,
})
})
.collect();
if !pairs.is_empty() {
result.push(CompGroup {
key: gpu_g.key.clone(),
cpu_name: cpu_g.name.clone(),
gpu_name: gpu_g.name.clone(),
pairs,
});
}
}
}
result
}
fn write_comparison_markdown(
path: &str,
cpu_hw: &str,
gpu_hw: &str,
comp: &[CompGroup],
) {
let mut md = String::new();
md.push_str("# fast-umap — CPU vs GPU Comparison\n\n");
md.push_str(&format!("> **CPU:** {cpu_hw} \n"));
md.push_str(&format!("> **GPU:** {gpu_hw} (WGPU / Metal) \n"));
md.push_str("> **Backend:** burn 0.20.1 · NdArray vs WGPU \n");
md.push_str("> **Reproduce:** `cargo run --release --bin bench_report`\n\n");
md.push_str("\n\n");
md.push_str("Speedup = CPU mean ÷ GPU mean. \n");
md.push_str("> 1× = GPU faster, < 1× = CPU faster (e.g. small tensors where dispatch overhead dominates).\n\n");
md.push_str("---\n\n");
for cg in comp {
md.push_str(&format!("## `{}`\n\n", cg.key));
md.push_str(&format!(
"| Input | {} | {} | Speedup |\n",
cg.cpu_name, cg.gpu_name
));
md.push_str("|-------|");
md.push_str(&"-".repeat(cg.cpu_name.len() + 2));
md.push_str("|");
md.push_str(&"-".repeat(cg.gpu_name.len() + 2));
md.push_str("|----------|\n");
for p in &cg.pairs {
let s = p.speedup();
let speedup_cell = if s >= 1.5 {
format!("**{s:.2}×** 🚀")
} else if s >= 1.0 {
format!("**{s:.2}×**")
} else {
format!("{s:.2}× *(CPU faster)*")
};
md.push_str(&format!(
"| `{}` | {} | {} | {} |\n",
p.label,
fmt_us(p.cpu_us),
fmt_us(p.gpu_us),
speedup_cell
));
}
md.push('\n');
}
fs::create_dir_all(Path::new(path).parent().unwrap()).unwrap();
fs::write(path, &md).unwrap();
println!(" Wrote {path}");
}
fn write_comparison_svg(path: &str, cpu_hw: &str, gpu_hw: &str, comp: &[CompGroup]) {
if comp.is_empty() {
return;
}
const SVG_W: f64 = 960.0;
const TITLE_H: f64 = 52.0;
const LEGEND_H: f64 = 22.0;
const CHART_LEFT: f64 = 185.0;
const CHART_RIGHT: f64 = 72.0; const CHART_TOP: f64 = 36.0; const CHART_BOTTOM: f64 = 32.0; const BAR_H: f64 = 11.0; const PAIR_GAP: f64 = 2.0; const ENTRY_GAP: f64 = 10.0; const PANEL_PAD: f64 = 8.0;
let cpu_colour = "4C78A8"; let gpu_colour = "F58518";
let chart_w = SVG_W - CHART_LEFT - CHART_RIGHT;
struct Panel<'a> {
cg: &'a CompGroup,
chart_h: f64,
panel_h: f64,
}
let panels: Vec<Panel> = comp
.iter()
.map(|cg| {
let n = cg.pairs.len() as f64;
let ch = n * (2.0 * BAR_H + PAIR_GAP) + (n - 1.0) * ENTRY_GAP;
let ph = CHART_TOP + LEGEND_H + ch + CHART_BOTTOM + 2.0 * PANEL_PAD;
Panel { cg, chart_h: ch, panel_h: ph }
})
.collect();
let total_h: f64 = TITLE_H + panels.iter().map(|p| p.panel_h + 10.0).sum::<f64>() + 10.0;
let log_ticks = |lo: f64, hi: f64| -> Vec<f64> {
let mut ticks = Vec::new();
let mut p = lo.log10().floor() as i32;
while 10f64.powi(p) <= hi * 1.1 {
for &m in &[1.0, 2.0, 5.0] {
let v = m * 10f64.powi(p);
if v >= lo * 0.8 && v <= hi * 1.2 {
ticks.push(v);
}
}
p += 1;
}
ticks.sort_by(|a, b| a.partial_cmp(b).unwrap());
ticks.dedup_by(|a, b| (*a - *b).abs() < 1e-12);
ticks
};
let mut out: Vec<u8> = Vec::new();
let w = &mut out;
writeln!(w, "<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"{SVG_W}\" height=\"{total_h:.1}\" font-family=\"monospace,Arial,sans-serif\">").unwrap();
writeln!(w, "<rect width=\"{SVG_W}\" height=\"{total_h:.1}\" fill=\"#FAFAFA\"/>").unwrap();
writeln!(w,
"<text x=\"{:.1}\" y=\"22\" text-anchor=\"middle\" font-size=\"14\" font-weight=\"bold\" fill=\"#222\">{cpu_hw} vs {gpu_hw} \u{2014} CPU/GPU Comparison</text>",
SVG_W / 2.0
).unwrap();
let lx = SVG_W / 2.0 - 100.0;
writeln!(w, "<rect x=\"{lx:.1}\" y=\"30\" width=\"12\" height=\"10\" fill=\"#{cpu_colour}\" rx=\"2\"/>").unwrap();
writeln!(w, "<text x=\"{:.1}\" y=\"39\" font-size=\"9\" fill=\"#333\">CPU (NdArray)</text>", lx + 15.0).unwrap();
writeln!(w, "<rect x=\"{:.1}\" y=\"30\" width=\"12\" height=\"10\" fill=\"#{gpu_colour}\" rx=\"2\"/>", lx + 105.0).unwrap();
writeln!(w, "<text x=\"{:.1}\" y=\"39\" font-size=\"9\" fill=\"#333\">GPU (WGPU/Metal)</text>", lx + 120.0).unwrap();
let mut cur_y = TITLE_H;
for panel in &panels {
let cg = panel.cg;
let py = cur_y;
let ph = panel.panel_h;
let all_vals: Vec<f64> = cg
.pairs
.iter()
.flat_map(|p| [p.cpu_us, p.gpu_us])
.collect();
let lo_val = all_vals.iter().cloned().fold(f64::INFINITY, f64::min);
let hi_val = all_vals.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
let lo_log = (lo_val * 0.45).log10();
let hi_log = (hi_val * 1.55).log10();
let x_of = |v: f64| -> f64 {
(v.log10() - lo_log) / (hi_log - lo_log) * chart_w
};
let cx = CHART_LEFT;
let cy = py + PANEL_PAD + CHART_TOP + LEGEND_H;
writeln!(w,
"<rect x=\"4\" y=\"{py:.1}\" width=\"{:.1}\" height=\"{ph:.1}\" rx=\"6\" fill=\"white\" stroke=\"#DDD\" stroke-width=\"1\"/>",
SVG_W - 8.0
).unwrap();
writeln!(w,
"<text x=\"{:.1}\" y=\"{:.1}\" text-anchor=\"middle\" font-size=\"11\" font-weight=\"bold\" fill=\"#333\">{}</text>",
SVG_W / 2.0,
py + PANEL_PAD + 20.0,
cg.key
).unwrap();
let ticks = log_ticks(lo_val * 0.45, hi_val * 1.55);
for t in &ticks {
let tx = cx + x_of(*t);
writeln!(w,
"<line x1=\"{tx:.1}\" y1=\"{cy:.1}\" x2=\"{tx:.1}\" y2=\"{:.1}\" stroke=\"#EBEBEB\" stroke-width=\"1\"/>",
cy + panel.chart_h
).unwrap();
writeln!(w,
"<text x=\"{tx:.1}\" y=\"{:.1}\" text-anchor=\"middle\" font-size=\"8\" fill=\"#999\">{}</text>",
cy + panel.chart_h + 12.0, fmt_us(*t)
).unwrap();
}
for (i, p) in cg.pairs.iter().enumerate() {
let pair_h = 2.0 * BAR_H + PAIR_GAP;
let by = cy + i as f64 * (pair_h + ENTRY_GAP);
let cpu_bw = x_of(p.cpu_us);
let cpu_lbl_y = by + BAR_H / 2.0 + 3.5;
writeln!(w,
"<rect x=\"{cx:.1}\" y=\"{by:.1}\" width=\"{cpu_bw:.1}\" height=\"{BAR_H:.1}\" rx=\"2\" fill=\"#{cpu_colour}\" opacity=\"0.85\"/>",
).unwrap();
writeln!(w,
"<text x=\"{:.1}\" y=\"{cpu_lbl_y:.1}\" font-size=\"7.5\" fill=\"#555\">{}</text>",
cx + cpu_bw + 4.0, fmt_us(p.cpu_us)
).unwrap();
let gpu_by = by + BAR_H + PAIR_GAP;
let gpu_bw = x_of(p.gpu_us);
let gpu_lbl_y = gpu_by + BAR_H / 2.0 + 3.5;
writeln!(w,
"<rect x=\"{cx:.1}\" y=\"{gpu_by:.1}\" width=\"{gpu_bw:.1}\" height=\"{BAR_H:.1}\" rx=\"2\" fill=\"#{gpu_colour}\" opacity=\"0.85\"/>",
).unwrap();
writeln!(w,
"<text x=\"{:.1}\" y=\"{gpu_lbl_y:.1}\" font-size=\"7.5\" fill=\"#555\">{}</text>",
cx + gpu_bw + 4.0, fmt_us(p.gpu_us)
).unwrap();
let speedup = p.speedup();
let (sp_text, sp_fill) = if speedup >= 2.0 {
(format!("{speedup:.1}\u{00D7} \u{1F680}"), "#157a1a")
} else if speedup >= 1.0 {
(format!("{speedup:.2}\u{00D7}"), "#2a7ec7")
} else {
(format!("{speedup:.2}\u{00D7}"), "#888")
};
let sp_x = SVG_W - CHART_RIGHT + 4.0;
let sp_y = by + pair_h / 2.0 + 3.5;
writeln!(w,
"<text x=\"{sp_x:.1}\" y=\"{sp_y:.1}\" font-size=\"8.5\" font-weight=\"bold\" fill=\"{sp_fill}\">{sp_text}</text>",
).unwrap();
writeln!(w,
"<text x=\"{:.1}\" y=\"{sp_y:.1}\" text-anchor=\"end\" font-size=\"8.5\" fill=\"#333\">{}</text>",
cx - 4.0, p.label
).unwrap();
}
writeln!(w,
"<line x1=\"{cx:.1}\" y1=\"{:.1}\" x2=\"{:.1}\" y2=\"{:.1}\" stroke=\"#BBB\" stroke-width=\"1\"/>",
cy + panel.chart_h, cx + chart_w, cy + panel.chart_h
).unwrap();
cur_y += ph + 10.0;
}
writeln!(w, "</svg>").unwrap();
let svg_str = String::from_utf8(out).expect("SVG is valid UTF-8");
fs::create_dir_all(Path::new(path).parent().unwrap()).unwrap();
fs::write(path, &svg_str).unwrap();
println!(" Wrote {path}");
}
fn main() {
let cpu_name = detect_cpu();
let cpu_safe = sanitize(&cpu_name);
println!("\n╔══════════════════════════════════════════════════════════╗");
println!("║ fast-umap benchmark reporter ║");
println!("╚══════════════════════════════════════════════════════════╝\n");
println!("CPU : {cpu_name}");
let has_gpu = gpu_available();
let gpu_name = if has_gpu {
detect_gpu_name().unwrap_or_else(|| "GPU (WGPU)".to_string())
} else {
String::new()
};
if has_gpu {
println!("GPU : {gpu_name} (WGPU/Metal/Vulkan)\n");
} else {
println!("GPU : not available or not usable — skipping GPU benchmarks\n");
}
let results_dir = "benches/results";
println!("━━━ Running CPU benchmarks (NdArray backend) ━━━");
let cpu_groups = run_cpu_benchmarks();
for g in &cpu_groups {
println!("\n [{}]", g.name);
for e in &g.entries {
println!(" {:40} {}", e.label, fmt_us(e.mean_us));
}
}
println!("\n━━━ Saving CPU results ━━━");
let cpu_hw_label = format!("CPU: {cpu_name}");
let cpu_md = format!("{results_dir}/cpu_{cpu_safe}.md");
let cpu_svg = format!("{results_dir}/cpu_{cpu_safe}.svg");
write_markdown(&cpu_md, &cpu_hw_label, &cpu_name, &cpu_groups);
write_svg(&cpu_svg, &cpu_hw_label, &cpu_groups);
let generic_md = format!("{results_dir}/benchmark_results.md");
let generic_svg = format!("{results_dir}/benchmark_results.svg");
fs::copy(&cpu_md, &generic_md).unwrap();
println!(" Wrote {generic_md} (copy of latest CPU run)");
write_svg(&generic_svg, &cpu_hw_label, &cpu_groups);
if has_gpu {
println!("\n━━━ Running GPU benchmarks (WGPU backend) ━━━");
let gpu_groups = run_gpu_benchmarks();
for g in &gpu_groups {
println!("\n [{}]", g.name);
for e in &g.entries {
println!(" {:40} {}", e.label, fmt_us(e.mean_us));
}
}
println!("\n━━━ Saving GPU results ━━━");
let gpu_safe = sanitize(&gpu_name);
let gpu_hw_label = format!("GPU: {gpu_name}");
let gpu_md = format!("{results_dir}/gpu_{gpu_safe}.md");
let gpu_svg = format!("{results_dir}/gpu_{gpu_safe}.svg");
write_markdown(&gpu_md, &gpu_hw_label, &gpu_name, &gpu_groups);
write_svg(&gpu_svg, &gpu_hw_label, &gpu_groups);
let comp = build_comparison(&cpu_groups, &gpu_groups);
if !comp.is_empty() {
println!("\n━━━ CPU vs GPU comparison ━━━");
for cg in &comp {
println!("\n [{}]", cg.key);
println!(" {:38} {:>10} {:>10} {:>8}", "input", "CPU", "GPU", "speedup");
println!(" {}", "─".repeat(72));
for p in &cg.pairs {
let s = p.speedup();
let icon = if s >= 2.0 { " 🚀" } else if s >= 1.0 { " ✓" } else { " ↓" };
println!(
" {:38} {:>10} {:>10} {:>6.2}×{}",
p.label,
fmt_us(p.cpu_us),
fmt_us(p.gpu_us),
s,
icon,
);
}
}
println!("\n━━━ Saving comparison results ━━━");
let comp_slug = cpu_safe.clone();
let comp_md = format!("{results_dir}/comparison_{comp_slug}.md");
let comp_svg = format!("{results_dir}/comparison_{comp_slug}.svg");
let comp_svg_rel = format!("{results_dir}/comparison.svg");
write_comparison_markdown(&comp_md, &cpu_name, &gpu_name, &comp);
write_comparison_svg(&comp_svg_rel, &cpu_name, &gpu_name, &comp);
fs::copy(&comp_svg_rel, &comp_svg).unwrap();
println!(" Wrote {comp_svg} (copy)");
}
}
println!("\n✓ Done. Results are in {results_dir}/");
println!();
let rerun = "cargo run --release --bin bench_report";
let rerun_full = "./bench.sh --criterion";
let w = 62usize;
let line = |s: &str| {
let pad = w.saturating_sub(s.len() + 2);
println!("║ {s}{} ║", " ".repeat(pad));
};
println!("╔{}╗", "═".repeat(w));
line("To rerun:");
line("");
line(" # hardware report (CPU + GPU, ~1 min):");
line(&format!(" {rerun}"));
line("");
line(" # + Criterion statistical suite (~5 min):");
line(&format!(" {rerun_full}"));
println!("╚{}╝", "═".repeat(w));
println!();
}